You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jr...@apache.org on 2016/07/26 23:04:53 UTC
[01/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Repository: incubator-impala
Updated Branches:
refs/heads/doc_prototype 0ad935b63 -> 463ddf924
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_varchar.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_varchar.xml b/docs/topics/impala_varchar.xml
new file mode 100644
index 0000000..32db4ae
--- /dev/null
+++ b/docs/topics/impala_varchar.xml
@@ -0,0 +1,215 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="varchar" rev="2.0.0">
+
+ <title>VARCHAR Data Type (CDH 5.2 or higher only)</title>
+ <titlealts><navtitle>VARCHAR (CDH 5.2 or higher only)</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">VARCHAR data type</indexterm>
+ A variable-length character type, truncated during processing if necessary to fit within the specified
+ length.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ In the column definition of a <codeph>CREATE TABLE</codeph> statement:
+ </p>
+
+<codeblock><varname>column_name</varname> VARCHAR(<varname>max_length</varname>)</codeblock>
+
+ <p>
+ The maximum length you can specify is 65,535.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/partitioning_bad"/>
+
+<!--
+<p>
+This type can be used for partition key columns.
+Because of the efficiency advantage of numeric values over character-based values,
+if the partition key is a string representation of a number,
+prefer to use an integer data type with sufficient range (<codeph>INT</codeph>,
+<codeph>BIGINT</codeph>, and so on) rather than this type.
+</p>
+-->
+
+ <p conref="../shared/impala_common.xml#common/hbase_no"/>
+
+ <p conref="../shared/impala_common.xml#common/parquet_blurb"/>
+
+ <ul>
+ <li>
+ This type can be read from and written to Parquet files.
+ </li>
+
+ <li>
+ There is no requirement for a particular level of Parquet.
+ </li>
+
+ <li>
+ Parquet files generated by Impala and containing this type can be freely interchanged with other components
+ such as Hive and MapReduce.
+ </li>
+
+ <li>
+ Parquet data files can contain values that are longer than allowed by the
+ <codeph>VARCHAR(<varname>n</varname>)</codeph> length limit. Impala ignores any extra trailing characters
+ when it processes those values during a query.
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/text_blurb"/>
+
+ <p>
+ Text data files can contain values that are longer than allowed by the
+ <codeph>VARCHAR(<varname>n</varname>)</codeph> length limit. Any extra trailing characters are ignored when
+ Impala processes those values during a query.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/schema_evolution_blurb"/>
+
+ <p>
+ You can use <codeph>ALTER TABLE ... CHANGE</codeph> to switch column data types to and from
+ <codeph>VARCHAR</codeph>. You can convert from <codeph>STRING</codeph> to
+ <codeph>VARCHAR(<varname>n</varname>)</codeph>, or from <codeph>VARCHAR(<varname>n</varname>)</codeph> to
+ <codeph>STRING</codeph>, or from <codeph>CHAR(<varname>n</varname>)</codeph> to
+ <codeph>VARCHAR(<varname>n</varname>)</codeph>, or from <codeph>VARCHAR(<varname>n</varname>)</codeph> to
+ <codeph>CHAR(<varname>n</varname>)</codeph>. When switching back and forth between <codeph>VARCHAR</codeph>
+ and <codeph>CHAR</codeph>, you can also change the length value. This schema evolution works the same for
+ tables using any file format. If a table contains values longer than the maximum length defined for a
+ <codeph>VARCHAR</codeph> column, Impala does not return an error. Any extra trailing characters are ignored
+ when Impala processes those values during a query.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/compatibility_blurb"/>
+
+ <p>
+ This type is available using Impala 2.0 or higher under CDH 4, or with Impala on CDH 5.2 or higher. There are
+ no compatibility issues with other components when exchanging data files or running Impala on CDH 4.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/internals_min_bytes"/>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+ <p conref="../shared/impala_common.xml#common/column_stats_variable"/>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/blobs_are_strings"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following examples show how long and short <codeph>VARCHAR</codeph> values are treated. Values longer
+ than the maximum specified length are truncated by <codeph>CAST()</codeph>, or when queried from existing
+ data files. Values shorter than the maximum specified length are represented as the actual length of the
+ value, with no extra padding as seen with <codeph>CHAR</codeph> values.
+ </p>
+
+<codeblock>create table varchar_1 (s varchar(1));
+create table varchar_4 (s varchar(4));
+create table varchar_20 (s varchar(20));
+
+insert into varchar_1 values (cast('a' as varchar(1))), (cast('b' as varchar(1))), (cast('hello' as varchar(1))), (cast('world' as varchar(1)));
+insert into varchar_4 values (cast('a' as varchar(4))), (cast('b' as varchar(4))), (cast('hello' as varchar(4))), (cast('world' as varchar(4)));
+insert into varchar_20 values (cast('a' as varchar(20))), (cast('b' as varchar(20))), (cast('hello' as varchar(20))), (cast('world' as varchar(20)));
+
+select * from varchar_1;
++---+
+| s |
++---+
+| a |
+| b |
+| h |
+| w |
++---+
+select * from varchar_4;
++------+
+| s |
++------+
+| a |
+| b |
+| hell |
+| worl |
++------+
+[localhost:21000] > select * from varchar_20;
++-------+
+| s |
++-------+
+| a |
+| b |
+| hello |
+| world |
++-------+
+select concat('[',s,']') as s from varchar_20;
++---------+
+| s |
++---------+
+| [a] |
+| [b] |
+| [hello] |
+| [world] |
++---------+
+</codeblock>
+
+ <p>
+ The following example shows how identical <codeph>VARCHAR</codeph> values compare as equal, even if the
+ columns are defined with different maximum lengths. Both tables contain <codeph>'a'</codeph> and
+ <codeph>'b'</codeph> values. The longer <codeph>'hello'</codeph> and <codeph>'world'</codeph> values from the
+ <codeph>VARCHAR_20</codeph> table were truncated when inserted into the <codeph>VARCHAR_1</codeph> table.
+ </p>
+
+<codeblock>select s from varchar_1 join varchar_20 using (s);
++-------+
+| s |
++-------+
+| a |
+| b |
++-------+
+</codeblock>
+
+ <p>
+ The following examples show how <codeph>VARCHAR</codeph> values are freely interchangeable with
+ <codeph>STRING</codeph> values in contexts such as comparison operators and built-in functions:
+ </p>
+
+<codeblock>select length(cast('foo' as varchar(100))) as length;
++--------+
+| length |
++--------+
+| 3 |
++--------+
+select cast('xyz' as varchar(5)) > cast('abc' as varchar(10)) as greater;
++---------+
+| greater |
++---------+
+| true |
++---------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/udf_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_string.xml#string"/>, <xref href="impala_char.xml#char"/>,
+ <xref href="impala_literals.xml#string_literals"/>,
+ <xref href="impala_string_functions.xml#string_functions"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_variance.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_variance.xml b/docs/topics/impala_variance.xml
new file mode 100644
index 0000000..e0c5d02
--- /dev/null
+++ b/docs/topics/impala_variance.xml
@@ -0,0 +1,127 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.4" id="variance">
+
+ <title>VARIANCE, VARIANCE_SAMP, VARIANCE_POP, VAR_SAMP, VAR_POP Functions</title>
+ <titlealts><navtitle>VARIANCE, VARIANCE_SAMP, VARIANCE_POP, VAR_SAMP, VAR_POP</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="Aggregate Functions"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">variance() function</indexterm>
+ <indexterm audience="Cloudera">variance_samp() function</indexterm>
+ <indexterm audience="Cloudera">variance_pop() function</indexterm>
+ <indexterm audience="Cloudera">var_samp() function</indexterm>
+ <indexterm audience="Cloudera">var_pop() function</indexterm>
+ An aggregate function that returns the
+ <xref href="http://en.wikipedia.org/wiki/Variance" scope="external" format="html">variance</xref> of a set of
+ numbers. This is a mathematical property that signifies how far the values spread apart from the mean. The
+ return value can be zero (if the input is a single value, or a set of identical values), or a positive number
+ otherwise.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>{ VARIANCE | VAR[IANCE]_SAMP | VAR[IANCE]_POP } ([DISTINCT | ALL] <varname>expression</varname>)</codeblock>
+
+ <p>
+ This function works with any numeric data type.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/former_odd_return_type_string"/>
+
+ <p>
+ This function is typically used in mathematical formulas related to probability distributions.
+ </p>
+
+ <p>
+ The <codeph>VARIANCE_SAMP()</codeph> and <codeph>VARIANCE_POP()</codeph> functions compute the sample
+ variance and population variance, respectively, of the input values. (<codeph>VARIANCE()</codeph> is an alias
+ for <codeph>VARIANCE_SAMP()</codeph>.) Both functions evaluate all input rows matched by the query. The
+ difference is that <codeph>VARIANCE_SAMP()</codeph> is scaled by <codeph>1/(N-1)</codeph> while
+ <codeph>VARIANCE_POP()</codeph> is scaled by <codeph>1/N</codeph>.
+ </p>
+
+ <p rev="2.0.0">
+ The functions <codeph>VAR_SAMP()</codeph> and <codeph>VAR_POP()</codeph> are the same as
+ <codeph>VARIANCE_SAMP()</codeph> and <codeph>VARIANCE_POP()</codeph>, respectively. These aliases are
+ available in Impala 2.0 and later.
+ </p>
+
+ <p>
+ If no input rows match the query, the result of any of these functions is <codeph>NULL</codeph>. If a single
+ input row matches the query, the result of any of these functions is <codeph>"0.0"</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ This example demonstrates how <codeph>VARIANCE()</codeph> and <codeph>VARIANCE_SAMP()</codeph> return the
+ same result, while <codeph>VARIANCE_POP()</codeph> uses a slightly different calculation to reflect that the
+ input data is considered part of a larger <q>population</q>.
+ </p>
+
+<codeblock>[localhost:21000] > select variance(score) from test_scores;
++-----------------+
+| variance(score) |
++-----------------+
+| 812.25 |
++-----------------+
+[localhost:21000] > select variance_samp(score) from test_scores;
++----------------------+
+| variance_samp(score) |
++----------------------+
+| 812.25 |
++----------------------+
+[localhost:21000] > select variance_pop(score) from test_scores;
++---------------------+
+| variance_pop(score) |
++---------------------+
+| 811.438 |
++---------------------+
+</codeblock>
+
+ <p>
+ This example demonstrates that, because the return value of these aggregate functions is a
+ <codeph>STRING</codeph>, you convert the result with <codeph>CAST</codeph> if you need to do further
+ calculations as a numeric value.
+ </p>
+
+<codeblock>[localhost:21000] > create table score_stats as select cast(stddev(score) as decimal(7,4)) `standard_deviation`, cast(variance(score) as decimal(7,4)) `variance` from test_scores;
++-------------------+
+| summary |
++-------------------+
+| Inserted 1 row(s) |
++-------------------+
+[localhost:21000] > desc score_stats;
++--------------------+--------------+---------+
+| name | type | comment |
++--------------------+--------------+---------+
+| standard_deviation | decimal(7,4) | |
+| variance | decimal(7,4) | |
++--------------------+--------------+---------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/analytic_not_allowed_caveat"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ The <codeph>STDDEV()</codeph>, <codeph>STDDEV_POP()</codeph>, and <codeph>STDDEV_SAMP()</codeph> functions
+ compute the standard deviation (square root of the variance) based on the results of
+ <codeph>VARIANCE()</codeph>, <codeph>VARIANCE_POP()</codeph>, and <codeph>VARIANCE_SAMP()</codeph>
+ respectively. See <xref href="impala_stddev.xml#stddev"/> for details about the standard deviation property.
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_views.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_views.xml b/docs/topics/impala_views.xml
new file mode 100644
index 0000000..a6c1a41
--- /dev/null
+++ b/docs/topics/impala_views.xml
@@ -0,0 +1,185 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.1" id="views">
+
+ <title>Overview of Impala Views</title>
+ <titlealts><navtitle>Views</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Views are lightweight logical constructs that act as aliases for queries. You can specify a view name in a
+ query (a <codeph>SELECT</codeph> statement or the <codeph>SELECT</codeph> portion of an
+ <codeph>INSERT</codeph> statement) where you would usually specify a table name.
+ </p>
+
+ <p>
+ A view lets you:
+ </p>
+
+ <ul>
+ <li>
+ Issue complicated queries with compact and simple syntax:
+<codeblock>-- Take a complicated reporting query, plug it into a CREATE VIEW statement...
+create view v1 as select c1, c2, avg(c3) from t1 group by c3 order by c1 desc limit 10;
+-- ... and now you can produce the report with 1 line of code.
+select * from v1;</codeblock>
+ </li>
+
+ <li>
+ Reduce maintenance, by avoiding the duplication of complicated queries across multiple applications in
+ multiple languages:
+<codeblock>create view v2 as select t1.c1, t1.c2, t2.c3 from t1 join t2 on (t1.id = t2.id);
+-- This simple query is safer to embed in reporting applications than the longer query above.
+-- The view definition can remain stable even if the structure of the underlying tables changes.
+select c1, c2, c3 from v2;</codeblock>
+ </li>
+
+ <li>
+ Build a new, more refined query on top of the original query by adding new clauses, select-list
+ expressions, function calls, and so on:
+<codeblock>create view average_price_by_category as select category, avg(price) as avg_price from products group by category;
+create view expensive_categories as select category, avg_price from average_price_by_category order by avg_price desc limit 10000;
+create view top_10_expensive_categories as select category, avg_price from expensive_categories limit 10;</codeblock>
+ This technique lets you build up several more or less granular variations of the same query, and switch
+ between them when appropriate.
+<!-- My original assumption was confirmed correct by Alex: outer ORDER BY not actually needed.
+In this case, we put an <codeph>ORDER BY</codeph> clause on the <q>top 10</q> view, even though there was already an <codeph>ORDER BY</codeph>
+on the <q>top 10000</q> view, because when a query is executed in parallel and distributed among multiple nodes, the ordering is only
+guaranteed if there is an <codeph>ORDER BY</codeph> clause at the outermost level.
+-->
+ </li>
+
+ <li>
+ Set up aliases with intuitive names for tables, columns, result sets from joins, and so on:
+<codeblock>-- The original tables might have cryptic names inherited from a legacy system.
+create view action_items as select rrptsk as assignee, treq as due_date, dmisc as notes from vxy_t1_br;
+-- You can leave original names for compatibility, build new applications using more intuitive ones.
+select assignee, due_date, notes from action_items;</codeblock>
+ </li>
+
+ <li>
+ Swap tables with others that use different file formats, partitioning schemes, and so on without any
+ downtime for data copying or conversion:
+<codeblock>create table slow (x int, s string) stored as textfile;
+create view report as select s from slow where x between 20 and 30;
+-- Query is kind of slow due to inefficient table definition, but it works.
+select * from report;
+
+create table fast (s string) partitioned by (x int) stored as parquet;
+-- ...Copy data from SLOW to FAST. Queries against REPORT view continue to work...
+
+-- After changing the view definition, queries will be faster due to partitioning,
+-- binary format, and compression in the new table.
+alter view report as select s from fast where x between 20 and 30;
+select * from report;</codeblock>
+ </li>
+
+ <li>
+ Avoid coding lengthy subqueries and repeating the same subquery text in many other queries.
+ </li>
+
+ <li rev="2.3.0 collevelauth">
+ Set up fine-grained security where a user can query some columns from a table but not other columns.
+ Because CDH 5.5 / Impala 2.3 and higher support column-level authorization, this technique is no longer
+ required. If you formerly implemented column-level security through views, see
+ <xref href="sg_hive_sql.xml#concept_c2q_4qx_p4/col_level_auth_sentry"/> for details about the
+ column-level authorization feature.
+ <!-- See <xref href="impala_authorization.xml#security_examples/sec_ex_views"/> for details. -->
+ </li>
+ </ul>
+
+ <p>
+ The SQL statements that configure views are <xref href="impala_create_view.xml#create_view"/>,
+ <xref href="impala_alter_view.xml#alter_view"/>, and <xref href="impala_drop_view.xml#drop_view"/>. You can
+ specify view names when querying data (<xref href="impala_select.xml#select"/>) and copying data from one
+ table to another (<xref href="impala_insert.xml#insert"/>). The <xref href="impala_with.xml#with">WITH</xref>
+ clause creates an inline view that exists only for the duration of a single query.
+ </p>
+
+<codeblock>[localhost:21000] > create view trivial as select * from customer;
+[localhost:21000] > create view some_columns as select c_first_name, c_last_name, c_login from customer;
+[localhost:21000] > select * from some_columns limit 5;
+Query finished, fetching results ...
++--------------+-------------+---------+
+| c_first_name | c_last_name | c_login |
++--------------+-------------+---------+
+| Javier | Lewis | |
+| Amy | Moses | |
+| Latisha | Hamilton | |
+| Michael | White | |
+| Robert | Moran | |
++--------------+-------------+---------+
+[localhost:21000] > create view ordered_results as select * from some_columns order by c_last_name desc, c_first_name desc limit 1000;
+[localhost:21000] > select * from ordered_results limit 5;
+Query: select * from ordered_results limit 5
+Query finished, fetching results ...
++--------------+-------------+---------+
+| c_first_name | c_last_name | c_login |
++--------------+-------------+---------+
+| Thomas | Zuniga | |
+| Sarah | Zuniga | |
+| Norma | Zuniga | |
+| Lloyd | Zuniga | |
+| Lisa | Zuniga | |
++--------------+-------------+---------+
+Returned 5 row(s) in 0.48s</codeblock>
+
+ <p>
+ The previous example uses descending order for <codeph>ORDERED_RESULTS</codeph> because in the sample TPC-DS
+ data, there are some rows with empty strings for both <codeph>C_FIRST_NAME</codeph> and
+ <codeph>C_LAST_NAME</codeph>, making the lowest-ordered names not useful in a sample query.
+ </p>
+
+<codeblock>create view visitors_by_day as select day, count(distinct visitors) as howmany from web_traffic group by day;
+create view top_10_days as select day, howmany from visitors_by_day order by howmany limit 10;
+select * from top_10_days;</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/describe_formatted_view"/>
+
+ <p conref="../shared/impala_common.xml#common/create_table_like_view"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_views"/>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <ul>
+ <li>
+ <p>
+ You cannot insert into an Impala view. (In some database systems, this operation is allowed and inserts
+ rows into the base table.) You can use a view name on the right-hand side of an <codeph>INSERT</codeph>
+ statement, in the <codeph>SELECT</codeph> part.
+ </p>
+ </li>
+
+ <li>
+<!-- This same text is conref'ed in the #views and the #partition_pruning topics. -->
+ <p conref="../shared/impala_common.xml#common/partitions_and_views"/>
+ </li>
+
+ <li rev="1.4.0">
+ <p conref="../shared/impala_common.xml#common/order_by_view_restriction"/>
+ </li>
+ </ul>
+
+ <p>
+ <b>Related statements:</b> <xref href="impala_create_view.xml#create_view"/>,
+ <xref href="impala_alter_view.xml#alter_view"/>, <xref href="impala_drop_view.xml#drop_view"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_with.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_with.xml b/docs/topics/impala_with.xml
new file mode 100644
index 0000000..8d1001c
--- /dev/null
+++ b/docs/topics/impala_with.xml
@@ -0,0 +1,64 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.1" id="with">
+
+ <title>WITH Clause</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ A clause that can be added before a <codeph>SELECT</codeph> statement, to define aliases for complicated
+ expressions that are referenced multiple times within the body of the <codeph>SELECT</codeph>. Similar to
+ <codeph>CREATE VIEW</codeph>, except that the table and column names defined in the <codeph>WITH</codeph>
+ clause do not persist after the query finishes, and do not conflict with names used in actual tables or
+ views. Also known as <q>subquery factoring</q>.
+ </p>
+
+ <p>
+ You can rewrite a query using subqueries to work the same as with the <codeph>WITH</codeph> clause. The
+ purposes of the <codeph>WITH</codeph> clause are:
+ </p>
+
+ <ul>
+ <li>
+ Convenience and ease of maintenance from less repetition within the body of the query. Typically used with
+ queries involving <codeph>UNION</codeph>, joins, or aggregation functions where the similar complicated
+ expressions are referenced multiple times.
+ </li>
+
+ <li>
+ SQL code that is easier to read and understand by abstracting the most complex part of the query into a
+ separate block.
+ </li>
+
+ <li>
+ Improved compatibility with SQL from other database systems that support the same clause (primarily Oracle
+ Database).
+ <note>
+ <p>
+ The Impala <codeph>WITH</codeph> clause does not support recursive queries in the
+ <codeph>WITH</codeph> clause, which are supported in some other database systems.
+ </p>
+ </note>
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/sql1999"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>-- Define 2 subqueries that can be referenced from the body of a longer query.
+with t1 as (select 1), t2 as (select 2) insert into tab select * from t1 union all select * from t2;
+
+-- Define one subquery at the outer level, and another at the inner level as part of the
+-- initial stage of the UNION ALL query.
+with t1 as (select 1) (with t2 as (select 2) select * from t2) union all select * from t1;</codeblock>
+ </conbody>
+</concept>
[18/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_array.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_array.xml b/docs/topics/impala_array.xml
new file mode 100644
index 0000000..1e60795
--- /dev/null
+++ b/docs/topics/impala_array.xml
@@ -0,0 +1,266 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="array">
+
+ <title>ARRAY Complex Type (CDH 5.5 or higher only)</title>
+
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ A complex data type that can represent an arbitrary number of ordered elements.
+ The elements can be scalars or another complex type (<codeph>ARRAY</codeph>,
+ <codeph>STRUCT</codeph>, or <codeph>MAP</codeph>).
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<!-- To do: make sure there is sufficient syntax info under the SELECT statement to understand how to query all the complex types. -->
+
+<codeblock><varname>column_name</varname> ARRAY &lt; <varname>type</varname> &gt;
+
+type ::= <varname>primitive_type</varname> | <varname>complex_type</varname>
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_combo"/>
+
+ <p>
+ The elements of the array have no names. You refer to the value of the array item using the
+ <codeph>ITEM</codeph> pseudocolumn, or its position in the array with the <codeph>POS</codeph>
+ pseudocolumn. See <xref href="impala_complex_types.xml#item"/> for information about
+ these pseudocolumns.
+ </p>
+
+<!-- Array is a frequently used idiom; don't recommend MAP right up front, since that is more rarely used. STRUCT has all different considerations.
+ <p>
+ If it would be logical to have a fixed number of elements and give each one a name, consider using a
+ <codeph>MAP</codeph> (when all elements are of the same type) or a <codeph>STRUCT</codeph> (if different
+ elements have different types) instead of an <codeph>ARRAY</codeph>.
+ </p>
+-->
+
+ <p>
+ Each row can have a different number of elements (including none) in the array for that row.
+ </p>
+
+<!-- Since you don't use numeric indexes, this assertion and advice doesn't make sense.
+ <p>
+ If you attempt to refer to a non-existent array element, the result is <codeph>NULL</codeph>. Therefore,
+ when using operations such as addition or string concatenation involving array elements, you might use
+ conditional functions to substitute default values such as 0 or <codeph>""</codeph> in the place of missing
+ array elements.
+ </p>
+-->
+
+ <p>
+ When an array contains items of scalar types, you can use aggregation functions on the array elements without using join notation. For
+ example, you can find the <codeph>COUNT()</codeph>, <codeph>AVG()</codeph>, <codeph>SUM()</codeph>, and so on of numeric array
+ elements, or the <codeph>MAX()</codeph> and <codeph>MIN()</codeph> of any scalar array elements by referring to
+ <codeph><varname>table_name</varname>.<varname>array_column</varname></codeph> in the <codeph>FROM</codeph> clause of the query. When
+ you need to cross-reference values from the array with scalar values from the same row, such as by including a <codeph>GROUP
+ BY</codeph> clause to produce a separate aggregated result for each row, then the join clause is required.
+ </p>
+
+ <p>
+ A common usage pattern with complex types is to have an array as the top-level type for the column:
+ an array of structs, an array of maps, or an array of arrays.
+ For example, you can model a denormalized table by creating a column that is an <codeph>ARRAY</codeph>
+ of <codeph>STRUCT</codeph> elements; each item in the array represents a row from a table that would
+ normally be used in a join query. This kind of data structure lets you essentially denormalize tables by
+ associating multiple rows from one table with the matching row in another table.
+ </p>
+
+ <p>
+ You typically do not create more than one top-level <codeph>ARRAY</codeph> column, because if there is
+ some relationship between the elements of multiple arrays, it is convenient to model the data as
+ an array of another complex type element (either <codeph>STRUCT</codeph> or <codeph>MAP</codeph>).
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_describe"/>
+
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <ul conref="../shared/impala_common.xml#common/complex_types_restrictions">
+ <li/>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <note conref="../shared/impala_common.xml#common/complex_type_schema_pointer"/>
+
+ <p>
+ The following example shows how to construct a table with various kinds of <codeph>ARRAY</codeph> columns,
+ both at the top level and nested within other complex types.
+ Whenever the <codeph>ARRAY</codeph> consists of a scalar value, such as in the <codeph>PETS</codeph>
+ column or the <codeph>CHILDREN</codeph> field, you can see that future expansion is limited.
+ For example, you could not easily evolve the schema to record the kind of pet or the child's birthday alongside the name.
+ Therefore, it is more common to use an <codeph>ARRAY</codeph> whose elements are of <codeph>STRUCT</codeph> type,
+ to associate multiple fields with each array element.
+ </p>
+
+ <note>
+ Practice the <codeph>CREATE TABLE</codeph> and query notation for complex type columns
+ using empty tables, until you can visualize a complex data structure and construct corresponding SQL statements reliably.
+ </note>
+
+<!-- To do: verify and flesh out this example. -->
+
+<codeblock><![CDATA[CREATE TABLE array_demo
+(
+ id BIGINT,
+ name STRING,
+-- An ARRAY of scalar type as a top-level column.
+ pets ARRAY <STRING>,
+
+-- An ARRAY with elements of complex type (STRUCT).
+ places_lived ARRAY < STRUCT <
+ place: STRING,
+ start_year: INT
+ >>,
+
+-- An ARRAY as a field (CHILDREN) within a STRUCT.
+-- (The STRUCT is inside another ARRAY, because it is rare
+-- for a STRUCT to be a top-level column.)
+ marriages ARRAY < STRUCT <
+ spouse: STRING,
+ children: ARRAY <STRING>
+ >>,
+
+-- An ARRAY as the value part of a MAP.
+-- The first MAP field (the key) would be a value such as
+-- 'Parent' or 'Grandparent', and the corresponding array would
+-- represent 2 parents, 4 grandparents, and so on.
+ ancestors MAP < STRING, ARRAY <STRING> >
+)
+STORED AS PARQUET;
+]]>
+</codeblock>
+
+ <p>
+ The following example shows how to examine the structure of a table containing one or more <codeph>ARRAY</codeph> columns by using the
+ <codeph>DESCRIBE</codeph> statement. You can visualize each <codeph>ARRAY</codeph> as its own two-column table, with columns
+ <codeph>ITEM</codeph> and <codeph>POS</codeph>.
+ </p>
+
+<!-- To do: extend the examples to include MARRIAGES and ANCESTORS columns, or get rid of those columns. -->
+
+<codeblock><![CDATA[DESCRIBE array_demo;
++--------------+---------------------------+
+| name | type |
++--------------+---------------------------+
+| id | bigint |
+| name | string |
+| pets | array<string> |
+| places_lived | array<struct< |
+| | place:string, |
+| | start_year:int |
+| | >> |
+| marriages | array<struct< |
+| | spouse:string, |
+| | children:array<string> |
+| | >> |
+| ancestors | map<string,array<string>> |
++--------------+---------------------------+
+
+DESCRIBE array_demo.pets;
++------+--------+
+| name | type |
++------+--------+
+| item | string |
+| pos | bigint |
++------+--------+
+
+DESCRIBE array_demo.marriages;
++------+--------------------------+
+| name | type |
++------+--------------------------+
+| item | struct< |
+| | spouse:string, |
+| | children:array<string> |
+| | > |
+| pos | bigint |
++------+--------------------------+
+
+DESCRIBE array_demo.places_lived;
++------+------------------+
+| name | type |
++------+------------------+
+| item | struct< |
+| | place:string, |
+| | start_year:int |
+| | > |
+| pos | bigint |
++------+------------------+
+
+DESCRIBE array_demo.ancestors;
++-------+---------------+
+| name | type |
++-------+---------------+
+| key | string |
+| value | array<string> |
++-------+---------------+
+]]>
+</codeblock>
+
+ <p>
+ The following example shows queries involving <codeph>ARRAY</codeph> columns containing elements of scalar or complex types. You
+ <q>unpack</q> each <codeph>ARRAY</codeph> column by referring to it in a join query, as if it were a separate table with
+ <codeph>ITEM</codeph> and <codeph>POS</codeph> columns. If the array element is a scalar type, you refer to its value using the
+ <codeph>ITEM</codeph> pseudocolumn. If the array element is a <codeph>STRUCT</codeph>, you refer to the <codeph>STRUCT</codeph> fields
+ using dot notation and the field names. If the array element is another <codeph>ARRAY</codeph> or a <codeph>MAP</codeph>, you use
+ another level of join to unpack the nested collection elements.
+ </p>
+
+<!-- To do: have some sample output to show for these queries. -->
+
+<codeblock><![CDATA[-- Array of scalar values.
+-- Each array element represents a single string, plus we know its position in the array.
+SELECT id, name, pets.pos, pets.item FROM array_demo, array_demo.pets;
+
+-- Array of structs.
+-- Now each array element has named fields, possibly of different types.
+-- You can consider an ARRAY of STRUCT to represent a table inside another table.
+SELECT id, name, places_lived.pos, places_lived.item.place, places_lived.item.start_year
+FROM array_demo, array_demo.places_lived;
+
+-- The .ITEM name is optional for array elements that are structs.
+-- The following query is equivalent to the previous one, with .ITEM
+-- removed from the column references.
+SELECT id, name, places_lived.pos, places_lived.place, places_lived.start_year
+ FROM array_demo, array_demo.places_lived;
+
+-- To filter specific items from the array, do comparisons against the .POS or .ITEM
+-- pseudocolumns, or names of struct fields, in the WHERE clause.
+SELECT id, name, pets.item FROM array_demo, array_demo.pets
+ WHERE pets.pos in (0, 1, 3);
+
+SELECT id, name, pets.item FROM array_demo, array_demo.pets
+ WHERE pets.item LIKE 'Mr. %';
+
+SELECT id, name, places_lived.pos, places_lived.place, places_lived.start_year
+ FROM array_demo, array_demo.places_lived
+WHERE places_lived.place like '%California%';
+]]>
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_complex_types.xml#complex_types"/>,
+<!-- <xref href="impala_array.xml#array"/>, -->
+ <xref href="impala_struct.xml#struct"/>, <xref href="impala_map.xml#map"/>
+ </p>
+
+ </conbody>
+
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_avg.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_avg.xml b/docs/topics/impala_avg.xml
new file mode 100644
index 0000000..26f5450
--- /dev/null
+++ b/docs/topics/impala_avg.xml
@@ -0,0 +1,223 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="avg">
+
+ <title>AVG Function</title>
+ <titlealts><navtitle>AVG</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="Analytic Functions"/>
+ <data name="Category" value="Aggregate Functions"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">avg() function</indexterm>
+ An aggregate function that returns the average value from a set of numbers or <codeph>TIMESTAMP</codeph> values.
+ Its single argument can be a numeric column, or the numeric result of a function or expression applied to the
+ column value. Rows with a <codeph>NULL</codeph> value for the specified column are ignored. If the table is empty,
+ or all the values supplied to <codeph>AVG</codeph> are <codeph>NULL</codeph>, <codeph>AVG</codeph> returns
+ <codeph>NULL</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>AVG([DISTINCT | ALL] <varname>expression</varname>) [OVER (<varname>analytic_clause</varname>)]
+</codeblock>
+
+ <p>
+ When the query contains a <codeph>GROUP BY</codeph> clause, returns one value for each combination of
+ grouping values.
+ </p>
+
+ <p>
+ <b>Return type:</b> <codeph>DOUBLE</codeph> for numeric values; <codeph>TIMESTAMP</codeph> for
+ <codeph>TIMESTAMP</codeph> values
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_explanation"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_example"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>-- Average all the non-NULL values in a column.
+insert overwrite avg_t values (2),(4),(6),(null),(null);
+-- The average of the above values is 4: (2+4+6) / 3. The 2 NULL values are ignored.
+select avg(x) from avg_t;
+-- Average only certain values from the column.
+select avg(x) from t1 where month = 'January' and year = '2013';
+-- Apply a calculation to the value of the column before averaging.
+select avg(x/3) from t1;
+-- Apply a function to the value of the column before averaging.
+-- Here we are substituting a value of 0 for all NULLs in the column,
+-- so that those rows do factor into the return value.
+select avg(isnull(x,0)) from t1;
+-- Apply some number-returning function to a string column and average the results.
+-- If column s contains any NULLs, length(s) also returns NULL and those rows are ignored.
+select avg(length(s)) from t1;
+-- Can also be used in combination with DISTINCT and/or GROUP BY.
+-- Return more than one result.
+select month, year, avg(page_visits) from web_stats group by month, year;
+-- Filter the input to eliminate duplicates before performing the calculation.
+select avg(distinct x) from t1;
+-- Filter the output after performing the calculation.
+select avg(x) from t1 group by y having avg(x) between 1 and 20;
+</codeblock>
+
+ <p rev="2.0.0">
+ The following examples show how to use <codeph>AVG()</codeph> in an analytic context. They use a table
+ containing integers from 1 to 10. Notice how the <codeph>AVG()</codeph> is reported for each input value, as
+ opposed to the <codeph>GROUP BY</codeph> clause which condenses the result set.
+<codeblock>select x, property, avg(x) over (partition by property) as avg from int_t where property in ('odd','even');
++----+----------+-----+
+| x | property | avg |
++----+----------+-----+
+| 2 | even | 6 |
+| 4 | even | 6 |
+| 6 | even | 6 |
+| 8 | even | 6 |
+| 10 | even | 6 |
+| 1 | odd | 5 |
+| 3 | odd | 5 |
+| 5 | odd | 5 |
+| 7 | odd | 5 |
+| 9 | odd | 5 |
++----+----------+-----+
+</codeblock>
+
+Adding an <codeph>ORDER BY</codeph> clause lets you experiment with results that are cumulative or apply to a moving
+set of rows (the <q>window</q>). The following examples use <codeph>AVG()</codeph> in an analytic context
+(that is, with an <codeph>OVER()</codeph> clause) to produce a running average of all the even values,
+then a running average of all the odd values. The basic <codeph>ORDER BY x</codeph> clause implicitly
+activates a window clause of <codeph>RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>,
+which is effectively the same as <codeph>ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>,
+therefore all of these examples produce the same results:
+<codeblock>select x, property,
+ avg(x) over (partition by property <b>order by x</b>) as 'cumulative average'
+ from int_t where property in ('odd','even');
++----+----------+--------------------+
+| x | property | cumulative average |
++----+----------+--------------------+
+| 2 | even | 2 |
+| 4 | even | 3 |
+| 6 | even | 4 |
+| 8 | even | 5 |
+| 10 | even | 6 |
+| 1 | odd | 1 |
+| 3 | odd | 2 |
+| 5 | odd | 3 |
+| 7 | odd | 4 |
+| 9 | odd | 5 |
++----+----------+--------------------+
+
+select x, property,
+ avg(x) over
+ (
+ partition by property
+ <b>order by x</b>
+ <b>range between unbounded preceding and current row</b>
+ ) as 'cumulative average'
+from int_t where property in ('odd','even');
++----+----------+--------------------+
+| x | property | cumulative average |
++----+----------+--------------------+
+| 2 | even | 2 |
+| 4 | even | 3 |
+| 6 | even | 4 |
+| 8 | even | 5 |
+| 10 | even | 6 |
+| 1 | odd | 1 |
+| 3 | odd | 2 |
+| 5 | odd | 3 |
+| 7 | odd | 4 |
+| 9 | odd | 5 |
++----+----------+--------------------+
+
+select x, property,
+ avg(x) over
+ (
+ partition by property
+ <b>order by x</b>
+ <b>rows between unbounded preceding and current row</b>
+ ) as 'cumulative average'
+ from int_t where property in ('odd','even');
++----+----------+--------------------+
+| x | property | cumulative average |
++----+----------+--------------------+
+| 2 | even | 2 |
+| 4 | even | 3 |
+| 6 | even | 4 |
+| 8 | even | 5 |
+| 10 | even | 6 |
+| 1 | odd | 1 |
+| 3 | odd | 2 |
+| 5 | odd | 3 |
+| 7 | odd | 4 |
+| 9 | odd | 5 |
++----+----------+--------------------+
+</codeblock>
+
+The following examples show how to construct a moving window, with a running average taking into account 1 row before
+and 1 row after the current row, within the same partition (all the even values or all the odd values).
+Because of a restriction in the Impala <codeph>RANGE</codeph> syntax, this type of
+moving window is possible with the <codeph>ROWS BETWEEN</codeph> clause but not the <codeph>RANGE BETWEEN</codeph>
+clause:
+<codeblock>select x, property,
+ avg(x) over
+ (
+ partition by property
+ <b>order by x</b>
+ <b>rows between 1 preceding and 1 following</b>
+ ) as 'moving average'
+ from int_t where property in ('odd','even');
++----+----------+----------------+
+| x | property | moving average |
++----+----------+----------------+
+| 2 | even | 3 |
+| 4 | even | 4 |
+| 6 | even | 6 |
+| 8 | even | 8 |
+| 10 | even | 9 |
+| 1 | odd | 2 |
+| 3 | odd | 3 |
+| 5 | odd | 5 |
+| 7 | odd | 7 |
+| 9 | odd | 8 |
++----+----------+----------------+
+
+-- Doesn't work because of syntax restriction on RANGE clause.
+select x, property,
+ avg(x) over
+ (
+ partition by property
+ <b>order by x</b>
+ <b>range between 1 preceding and 1 following</b>
+ ) as 'moving average'
+from int_t where property in ('odd','even');
+ERROR: AnalysisException: RANGE is only supported with both the lower and upper bounds UNBOUNDED or one UNBOUNDED and the other CURRENT ROW.
+</codeblock>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+<!-- This conref appears under SUM(), AVG(), FLOAT, and DOUBLE topics. -->
+
+ <p conref="../shared/impala_common.xml#common/sum_double"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_analytic_functions.xml#analytic_functions"/>, <xref href="impala_max.xml#max"/>,
+ <xref href="impala_min.xml#min"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_batch_size.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_batch_size.xml b/docs/topics/impala_batch_size.xml
new file mode 100644
index 0000000..13a4b18
--- /dev/null
+++ b/docs/topics/impala_batch_size.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="batch_size">
+
+ <title>BATCH_SIZE Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">BATCH_SIZE query option</indexterm>
+ Number of rows evaluated at a time by SQL operators. If this option is unspecified, or set to 0, a predefined
+ default size is used. Using a large number improves responsiveness, especially for scan operations, at the cost of a higher memory footprint.
+ </p>
+
+ <p>
+ This option is primarily for Cloudera testing, or for use under the direction of Cloudera Support.
+ </p>
+
+ <p>
+ <b>Type:</b> numeric
+ </p>
+
+ <p>
+ <b>Default:</b> 0 (meaning the predefined default of 1024)
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_bigint.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_bigint.xml b/docs/topics/impala_bigint.xml
new file mode 100644
index 0000000..8f31bc6
--- /dev/null
+++ b/docs/topics/impala_bigint.xml
@@ -0,0 +1,100 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="bigint">
+
+ <title>BIGINT Data Type</title>
+ <titlealts><navtitle>BIGINT</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ An 8-byte integer data type used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph>
+ statements.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ In the column definition of a <codeph>CREATE TABLE</codeph> statement:
+ </p>
+
+<codeblock><varname>column_name</varname> BIGINT</codeblock>
+
+ <p>
+ <b>Range:</b> -9223372036854775808 .. 9223372036854775807. There is no <codeph>UNSIGNED</codeph> subtype.
+ </p>
+
+ <p>
+ <b>Conversions:</b> Impala automatically converts to a floating-point type (<codeph>FLOAT</codeph> or
+ <codeph>DOUBLE</codeph>). Use <codeph>CAST()</codeph> to convert to <codeph>TINYINT</codeph>,
+ <codeph>SMALLINT</codeph>, <codeph>INT</codeph>, <codeph>STRING</codeph>, or <codeph>TIMESTAMP</codeph>.
+ <ph conref="../shared/impala_common.xml#common/cast_int_to_timestamp"/>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>CREATE TABLE t1 (x BIGINT);
+SELECT CAST(1000 AS BIGINT);
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ <codeph>BIGINT</codeph> is a convenient type to use for column declarations because you can use any kind of
+ integer values in <codeph>INSERT</codeph> statements and they are promoted to <codeph>BIGINT</codeph> where
+ necessary. However, <codeph>BIGINT</codeph> also requires the most bytes of any integer type on disk and in
+ memory, meaning your queries are not as efficient and scalable as possible if you overuse this type.
+ Therefore, prefer to use the smallest integer type with sufficient range to hold all input values, and
+ <codeph>CAST()</codeph> when necessary to the appropriate type.
+ </p>
+
+ <p>
+ For a convenient and automated way to check the bounds of the <codeph>BIGINT</codeph> type, call the
+ functions <codeph>MIN_BIGINT()</codeph> and <codeph>MAX_BIGINT()</codeph>.
+ </p>
+
+ <p>
+ If an integer value is too large to be represented as a <codeph>BIGINT</codeph>, use a
+ <codeph>DECIMAL</codeph> instead with sufficient digits of precision.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/null_bad_numeric_cast"/>
+
+ <p conref="../shared/impala_common.xml#common/partitioning_good"/>
+
+ <p conref="../shared/impala_common.xml#common/hbase_ok"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/parquet_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/text_bulky"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/internals_8_bytes"/>
+
+ <p conref="../shared/impala_common.xml#common/added_forever"/>
+
+ <p conref="../shared/impala_common.xml#common/column_stats_constant"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/restrictions_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_literals.xml#numeric_literals"/>, <xref href="impala_tinyint.xml#tinyint"/>,
+ <xref href="impala_smallint.xml#smallint"/>, <xref href="impala_int.xml#int"/>,
+ <xref href="impala_bigint.xml#bigint"/>, <xref href="impala_decimal.xml#decimal"/>,
+ <xref href="impala_math_functions.xml#math_functions"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_bit_functions.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_bit_functions.xml b/docs/topics/impala_bit_functions.xml
new file mode 100644
index 0000000..77c7e5d
--- /dev/null
+++ b/docs/topics/impala_bit_functions.xml
@@ -0,0 +1,798 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="bit_functions" rev="2.3.0">
+
+ <title>Impala Bit Functions</title>
+ <titlealts><navtitle>Bit Functions</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Bit manipulation functions perform bitwise operations involved in scientific processing or computer science algorithms.
+ For example, these functions include setting, clearing, or testing bits within an integer value, or changing the
+ positions of bits with or without wraparound.
+ </p>
+
+ <p>
+ If a function takes two integer arguments that are required to be of the same type, the smaller argument is promoted
+ to the type of the larger one if required. For example, <codeph>BITAND(1,4096)</codeph> treats both arguments as
+ <codeph>SMALLINT</codeph>, because 1 can be represented as a <codeph>TINYINT</codeph> but 4096 requires a <codeph>SMALLINT</codeph>.
+ </p>
+
+ <p>
+ Remember that all Impala integer values are signed. Therefore, when dealing with binary values where the most significant
+ bit is 1, the specified or returned values might be negative when represented in base 10.
+ </p>
+
+ <p>
+ Whenever any argument is <codeph>NULL</codeph> (whether the input value, the bit position, or the number of shift or rotate positions),
+ the return value from any of these functions is also <codeph>NULL</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ The bit functions operate on all the integral data types: <xref href="impala_int.xml#int"/>,
+ <xref href="impala_bigint.xml#bigint"/>, <xref href="impala_smallint.xml#smallint"/>, and
+ <xref href="impala_tinyint.xml#tinyint"/>.
+ </p>
+
+ <p>
+ <b>Function reference:</b>
+ </p>
+
+ <p>
+ Impala supports the following bit functions:
+ </p>
+
+<!--
+bitand
+bitnot
+bitor
+bitxor
+countset
+getbit
+rotateleft
+rotateright
+setbit
+shiftleft
+shiftright
+-->
+
+<!-- Include this conref for all the bit functions, all newly added in Impala 2.3.0.
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+-->
+
+ <dl>
+
+ <dlentry id="bitand">
+
+ <dt>
+ <codeph>bitand(integer_type a, same_type b)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">bitand() function</indexterm>
+ <b>Purpose:</b> Returns an integer value representing the bits that are set to 1 in both of the arguments.
+ If the arguments are of different sizes, the smaller is promoted to the type of the larger.
+ <p>
+ <b>Usage notes:</b> The <codeph>bitand()</codeph> function is equivalent to the <codeph>&</codeph> binary operator.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_type_same"/>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show the results of ANDing integer values.
+ 255 contains all 1 bits in its lowermost 8 bits.
+ 32767 contains all 1 bits in its lowermost 15 bits.
+ <!--
+ Negative numbers have a 1 in the sign bit and the value is the
+ <xref href="https://en.wikipedia.org/wiki/Two%27s_complement" scope="external" format="html">two's complement</xref>
+ of the positive equivalent.
+ -->
+ You can use the <codeph>bin()</codeph> function to check the binary representation of any
+ integer value, although the result is always represented as a 64-bit value.
+ If necessary, the smaller argument is promoted to the
+ type of the larger one.
+ </p>
+<codeblock>select bitand(255, 32767); /* 0000000011111111 & 0111111111111111 */
++--------------------+
+| bitand(255, 32767) |
++--------------------+
+| 255 |
++--------------------+
+
+select bitand(32767, 1); /* 0111111111111111 & 0000000000000001 */
++------------------+
+| bitand(32767, 1) |
++------------------+
+| 1 |
++------------------+
+
+select bitand(32, 16); /* 00100000 & 00010000 */
++----------------+
+| bitand(32, 16) |
++----------------+
+| 0 |
++----------------+
+
+select bitand(12,5); /* 00001100 & 00000101 */
++---------------+
+| bitand(12, 5) |
++---------------+
+| 4 |
++---------------+
+
+select bitand(-1,15); /* 11111111 & 00001111 */
++----------------+
+| bitand(-1, 15) |
++----------------+
+| 15 |
++----------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="bitnot">
+
+ <dt>
+ <codeph>bitnot(integer_type a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">bitnot() function</indexterm>
+ <b>Purpose:</b> Inverts all the bits of the input argument.
+ <p>
+ <b>Usage notes:</b> The <codeph>bitnot()</codeph> function is equivalent to the <codeph>~</codeph> unary operator.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_type_same"/>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ These examples illustrate what happens when you flip all the bits of an integer value.
+ The sign always changes. The absolute values of the original value and the
+ inverted result always differ by one.
+ <!--
+ because negative values are represented as the
+ <xref href="https://en.wikipedia.org/wiki/Two%27s_complement" scope="external" format="html">two's complement</xref>
+ of the corresponding positive value.
+ -->
+ </p>
+<codeblock>select bitnot(127); /* 01111111 -> 10000000 */
++-------------+
+| bitnot(127) |
++-------------+
+| -128 |
++-------------+
+
+select bitnot(16); /* 00010000 -> 11101111 */
++------------+
+| bitnot(16) |
++------------+
+| -17 |
++------------+
+
+select bitnot(0); /* 00000000 -> 11111111 */
++-----------+
+| bitnot(0) |
++-----------+
+| -1 |
++-----------+
+
+select bitnot(-128); /* 10000000 -> 01111111 */
++--------------+
+| bitnot(-128) |
++--------------+
+| 127 |
++--------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="bitor">
+
+ <dt>
+ <codeph>bitor(integer_type a, same_type b)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">bitor() function</indexterm>
+ <b>Purpose:</b> Returns an integer value representing the bits that are set to 1 in either of the arguments.
+ If the arguments are of different sizes, the smaller is promoted to the type of the larger.
+ <p>
+ <b>Usage notes:</b> The <codeph>bitor()</codeph> function is equivalent to the <codeph>|</codeph> binary operator.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_type_same"/>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show the results of ORing integer values.
+ </p>
+<codeblock>select bitor(1,4); /* 00000001 | 00000100 */
++-------------+
+| bitor(1, 4) |
++-------------+
+| 5 |
++-------------+
+
+select bitor(16,48); /* 00010000 | 00110000 */
++---------------+
+| bitor(16, 48) |
++---------------+
+| 48 |
++---------------+
+
+select bitor(0,7); /* 00000000 | 00000111 */
++-------------+
+| bitor(0, 7) |
++-------------+
+| 7 |
++-------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="bitxor">
+
+ <dt>
+ <codeph>bitxor(integer_type a, same_type b)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">bitxor() function</indexterm>
+ <b>Purpose:</b> Returns an integer value representing the bits that are set to 1 in one but not both of the arguments.
+ If the arguments are of different sizes, the smaller is promoted to the type of the larger.
+ <p>
+ <b>Usage notes:</b> The <codeph>bitxor()</codeph> function is equivalent to the <codeph>^</codeph> binary operator.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_type_same"/>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show the results of XORing integer values.
+ XORing a non-zero value with zero returns the non-zero value.
+ XORing two identical values returns zero, because all the 1 bits from the first argument are also 1 bits in the second argument.
+ XORing different non-zero values turns off some bits and leaves others turned on, based on whether the same bit is set in both arguments.
+ </p>
+<codeblock>select bitxor(0,15); /* 00000000 ^ 00001111 */
++---------------+
+| bitxor(0, 15) |
++---------------+
+| 15 |
++---------------+
+
+select bitxor(7,7); /* 00000111 ^ 00000111 */
++--------------+
+| bitxor(7, 7) |
++--------------+
+| 0 |
++--------------+
+
+select bitxor(8,4); /* 00001000 ^ 00000100 */
++--------------+
+| bitxor(8, 4) |
++--------------+
+| 12 |
++--------------+
+
+select bitxor(3,7); /* 00000011 ^ 00000111 */
++--------------+
+| bitxor(3, 7) |
++--------------+
+| 4 |
++--------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="countset">
+
+ <dt>
+ <codeph>countset(integer_type a [, int zero_or_one])</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">countset() function</indexterm>
+ <b>Purpose:</b> By default, returns the number of 1 bits in the specified integer value.
+ If the optional second argument is set to zero, it returns the number of 0 bits instead.
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ In discussions of information theory, this operation is referred to as the
+ <q><xref href="https://en.wikipedia.org/wiki/Hamming_weight" scope="external" format="html">population count</xref></q>
+ or <q>popcount</q>.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_type_same"/>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show how to count the number of 1 bits in an integer value.
+ </p>
+<codeblock>select countset(1); /* 00000001 */
++-------------+
+| countset(1) |
++-------------+
+| 1 |
++-------------+
+
+select countset(3); /* 00000011 */
++-------------+
+| countset(3) |
++-------------+
+| 2 |
++-------------+
+
+select countset(16); /* 00010000 */
++--------------+
+| countset(16) |
++--------------+
+| 1 |
++--------------+
+
+select countset(17); /* 00010001 */
++--------------+
+| countset(17) |
++--------------+
+| 2 |
++--------------+
+
+select countset(7,1); /* 00000111 = 3 1 bits; the function counts 1 bits by default */
++----------------+
+| countset(7, 1) |
++----------------+
+| 3 |
++----------------+
+
+select countset(7,0); /* 00000111 = 5 0 bits; second argument can only be 0 or 1 */
++----------------+
+| countset(7, 0) |
++----------------+
+| 5 |
++----------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="getbit">
+
+ <dt>
+ <codeph>getbit(integer_type a, int position)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">getbit() function</indexterm>
+ <b>Purpose:</b> Returns a 0 or 1 representing the bit at a
+ specified position. The positions are numbered right to left, starting at zero.
+ The position argument cannot be negative.
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ When you use a literal input value, it is treated as an 8-bit, 16-bit,
+ and so on value, the smallest type that is appropriate.
+ The type of the input value limits the range of the positions.
+ Cast the input value to the appropriate type if you need to
+ ensure it is treated as a 64-bit, 32-bit, and so on value.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_type_same"/>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show how to test a specific bit within an integer value.
+ </p>
+<codeblock>select getbit(1,0); /* 00000001 */
++--------------+
+| getbit(1, 0) |
++--------------+
+| 1 |
++--------------+
+
+select getbit(16,1); /* 00010000 */
++---------------+
+| getbit(16, 1) |
++---------------+
+| 0 |
++---------------+
+
+select getbit(16,4); /* 00010000 */
++---------------+
+| getbit(16, 4) |
++---------------+
+| 1 |
++---------------+
+
+select getbit(16,5); /* 00010000 */
++---------------+
+| getbit(16, 5) |
++---------------+
+| 0 |
++---------------+
+
+select getbit(-1,3); /* 11111111 */
++---------------+
+| getbit(-1, 3) |
++---------------+
+| 1 |
++---------------+
+
+select getbit(-1,25); /* 11111111 */
+ERROR: Invalid bit position: 25
+
+select getbit(cast(-1 as int),25); /* 11111111111111111111111111111111 */
++-----------------------------+
+| getbit(cast(-1 as int), 25) |
++-----------------------------+
+| 1 |
++-----------------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="rotateleft">
+
+ <dt>
+ <codeph>rotateleft(integer_type a, int positions)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">rotateleft() function</indexterm>
+ <b>Purpose:</b> Rotates an integer value left by a specified number of bits.
+ As the most significant bit is taken out of the original value,
+ if it is a 1 bit, it is <q>rotated</q> back to the least significant bit.
+ Therefore, the final value has the same number of 1 bits as the original value,
+ just in different positions.
+ In computer science terms, this operation is a
+ <q><xref href="https://en.wikipedia.org/wiki/Circular_shift" scope="external" format="html">circular shift</xref></q>.
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Specifying a second argument of zero leaves the original value unchanged.
+ Rotating a -1 value by any number of positions still returns -1,
+ because the original value has all 1 bits and all the 1 bits are
+ preserved during rotation.
+ Similarly, rotating a 0 value by any number of positions still returns 0.
+ Rotating a value by the same number of bits as in the value returns the same value.
+ Because this is a circular operation, the number of positions is not limited
+ to the number of bits in the input value.
+ For example, rotating an 8-bit value by 1, 9, 17, and so on positions returns an
+ identical result in each case.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_type_same"/>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>select rotateleft(1,4); /* 00000001 -> 00010000 */
++------------------+
+| rotateleft(1, 4) |
++------------------+
+| 16 |
++------------------+
+
+select rotateleft(-1,155); /* 11111111 -> 11111111 */
++---------------------+
+| rotateleft(-1, 155) |
++---------------------+
+| -1 |
++---------------------+
+
+select rotateleft(-128,1); /* 10000000 -> 00000001 */
++---------------------+
+| rotateleft(-128, 1) |
++---------------------+
+| 1 |
++---------------------+
+
+select rotateleft(-127,3); /* 10000001 -> 00001100 */
++---------------------+
+| rotateleft(-127, 3) |
++---------------------+
+| 12 |
++---------------------+
+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="rotateright">
+
+ <dt>
+ <codeph>rotateright(integer_type a, int positions)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">rotateright() function</indexterm>
+ <b>Purpose:</b> Rotates an integer value right by a specified number of bits.
+ As the least significant bit is taken out of the original value,
+ if it is a 1 bit, it is <q>rotated</q> back to the most significant bit.
+ Therefore, the final value has the same number of 1 bits as the original value,
+ just in different positions.
+ In computer science terms, this operation is a
+ <q><xref href="https://en.wikipedia.org/wiki/Circular_shift" scope="external" format="html">circular shift</xref></q>.
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Specifying a second argument of zero leaves the original value unchanged.
+ Rotating a -1 value by any number of positions still returns -1,
+ because the original value has all 1 bits and all the 1 bits are
+ preserved during rotation.
+ Similarly, rotating a 0 value by any number of positions still returns 0.
+ Rotating a value by the same number of bits as in the value returns the same value.
+ Because this is a circular operation, the number of positions is not limited
+ to the number of bits in the input value.
+ For example, rotating an 8-bit value by 1, 9, 17, and so on positions returns an
+ identical result in each case.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_type_same"/>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>select rotateright(16,4); /* 00010000 -> 00000001 */
++--------------------+
+| rotateright(16, 4) |
++--------------------+
+| 1 |
++--------------------+
+
+select rotateright(-1,155); /* 11111111 -> 11111111 */
++----------------------+
+| rotateright(-1, 155) |
++----------------------+
+| -1 |
++----------------------+
+
+select rotateright(-128,1); /* 10000000 -> 01000000 */
++----------------------+
+| rotateright(-128, 1) |
++----------------------+
+| 64 |
++----------------------+
+
+select rotateright(-127,3); /* 10000001 -> 00110000 */
++----------------------+
+| rotateright(-127, 3) |
++----------------------+
+| 48 |
++----------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="setbit">
+
+ <dt>
+ <codeph>setbit(integer_type a, int position [, int zero_or_one])</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">setbit() function</indexterm>
+ <b>Purpose:</b> By default, changes a bit at a specified position to a 1, if it is not already.
+ If the optional third argument is set to zero, the specified bit is set to 0 instead.
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ If the bit at the specified position was already 1 (by default)
+ or 0 (with a third argument of zero), the return value is
+ the same as the first argument.
+ The positions are numbered right to left, starting at zero.
+ (Therefore, the return value could be different from the first argument
+ even if the position argument is zero.)
+ The position argument cannot be negative.
+ <p>
+ When you use a literal input value, it is treated as an 8-bit, 16-bit,
+ and so on value, the smallest type that is appropriate.
+ The type of the input value limits the range of the positions.
+ Cast the input value to the appropriate type if you need to
+ ensure it is treated as a 64-bit, 32-bit, and so on value.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_type_same"/>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>select setbit(0,0); /* 00000000 -> 00000001 */
++--------------+
+| setbit(0, 0) |
++--------------+
+| 1 |
++--------------+
+
+select setbit(0,3); /* 00000000 -> 00001000 */
++--------------+
+| setbit(0, 3) |
++--------------+
+| 8 |
++--------------+
+
+select setbit(7,3); /* 00000111 -> 00001111 */
++--------------+
+| setbit(7, 3) |
++--------------+
+| 15 |
++--------------+
+
+select setbit(15,3); /* 00001111 -> 00001111 */
++---------------+
+| setbit(15, 3) |
++---------------+
+| 15 |
++---------------+
+
+select setbit(0,32); /* By default, 0 is a TINYINT with only 8 bits. */
+ERROR: Invalid bit position: 32
+
+select setbit(cast(0 as bigint),32); /* For BIGINT, the position can be 0..63. */
++-------------------------------+
+| setbit(cast(0 as bigint), 32) |
++-------------------------------+
+| 4294967296 |
++-------------------------------+
+
+select setbit(7,3,1); /* 00000111 -> 00001111; setting to 1 is the default */
++-----------------+
+| setbit(7, 3, 1) |
++-----------------+
+| 15 |
++-----------------+
+
+select setbit(7,2,0); /* 00000111 -> 00000011; third argument of 0 clears instead of sets */
++-----------------+
+| setbit(7, 2, 0) |
++-----------------+
+| 3 |
++-----------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="shiftleft">
+
+ <dt>
+ <codeph>shiftleft(integer_type a, int positions)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">shiftleft() function</indexterm>
+ <b>Purpose:</b> Shifts an integer value left by a specified number of bits.
+ As the most significant bit is taken out of the original value,
+ it is discarded and the least significant bit becomes 0.
+ In computer science terms, this operation is a <q><xref href="https://en.wikipedia.org/wiki/Logical_shift" scope="external" format="html">logical shift</xref></q>.
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ The final value has either the same number of 1 bits as the original value, or fewer.
+ Shifting an 8-bit value by 8 positions, a 16-bit value by 16 positions, and so on produces
+ a result of zero.
+ </p>
+ <p>
+ Specifying a second argument of zero leaves the original value unchanged.
+ Shifting any value by 0 returns the original value.
+ Shifting any value by 1 is the same as multiplying it by 2,
+ as long as the value is small enough; larger values eventually
+ become negative when shifted, as the sign bit is set.
+ Starting with the value 1 and shifting it left by N positions gives
+ the same result as 2 to the Nth power, or <codeph>pow(2,<varname>N</varname>)</codeph>.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_type_same"/>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>select shiftleft(1,0); /* 00000001 -> 00000001 */
++-----------------+
+| shiftleft(1, 0) |
++-----------------+
+| 1 |
++-----------------+
+
+select shiftleft(1,3); /* 00000001 -> 00001000 */
++-----------------+
+| shiftleft(1, 3) |
++-----------------+
+| 8 |
++-----------------+
+
+select shiftleft(8,2); /* 00001000 -> 00100000 */
++-----------------+
+| shiftleft(8, 2) |
++-----------------+
+| 32 |
++-----------------+
+
+select shiftleft(127,1); /* 01111111 -> 11111110 */
++-------------------+
+| shiftleft(127, 1) |
++-------------------+
+| -2 |
++-------------------+
+
+select shiftleft(127,5); /* 01111111 -> 11100000 */
++-------------------+
+| shiftleft(127, 5) |
++-------------------+
+| -32 |
++-------------------+
+
+select shiftleft(-1,4); /* 11111111 -> 11110000 */
++------------------+
+| shiftleft(-1, 4) |
++------------------+
+| -16 |
++------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="shiftright">
+
+ <dt>
+ <codeph>shiftright(integer_type a, int positions)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">shiftright() function</indexterm>
+ <b>Purpose:</b> Shifts an integer value right by a specified number of bits.
+ As the least significant bit is taken out of the original value,
+ it is discarded and the most significant bit becomes 0.
+ In computer science terms, this operation is a <q><xref href="https://en.wikipedia.org/wiki/Logical_shift" scope="external" format="html">logical shift</xref></q>.
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Therefore, the final value has either the same number of 1 bits as the original value, or fewer.
+ Shifting an 8-bit value by 8 positions, a 16-bit value by 16 positions, and so on produces
+ a result of zero.
+ </p>
+ <p>
+ Specifying a second argument of zero leaves the original value unchanged.
+ Shifting any value by 0 returns the original value.
+ Shifting any positive value right by 1 is the same as dividing it by 2.
+ Negative values become positive when shifted right.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_type_same"/>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>select shiftright(16,0); /* 00010000 -> 00010000 */
++-------------------+
+| shiftright(16, 0) |
++-------------------+
+| 16 |
++-------------------+
+
+select shiftright(16,4); /* 00010000 -> 00000001 */
++-------------------+
+| shiftright(16, 4) |
++-------------------+
+| 1 |
++-------------------+
+
+select shiftright(16,5); /* 00010000 -> 00000000 */
++-------------------+
+| shiftright(16, 5) |
++-------------------+
+| 0 |
++-------------------+
+
+select shiftright(-1,1); /* 11111111 -> 01111111 */
++-------------------+
+| shiftright(-1, 1) |
++-------------------+
+| 127 |
++-------------------+
+
+select shiftright(-1,5); /* 11111111 -> 00000111 */
++-------------------+
+| shiftright(-1, 5) |
++-------------------+
+| 7 |
++-------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ </dl>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_boolean.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_boolean.xml b/docs/topics/impala_boolean.xml
new file mode 100644
index 0000000..6a8e299
--- /dev/null
+++ b/docs/topics/impala_boolean.xml
@@ -0,0 +1,128 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="boolean">
+
+ <title>BOOLEAN Data Type</title>
+ <titlealts><navtitle>BOOLEAN</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ A data type used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph> statements, representing a
+ single true/false choice.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ In the column definition of a <codeph>CREATE TABLE</codeph> statement:
+ </p>
+
+<codeblock><varname>column_name</varname> BOOLEAN</codeblock>
+
+ <p>
+ <b>Range:</b> <codeph>TRUE</codeph> or <codeph>FALSE</codeph>. Do not use quotation marks around the
+ <codeph>TRUE</codeph> and <codeph>FALSE</codeph> literal values. You can write the literal values in
+ uppercase, lowercase, or mixed case. The values queried from a table are always returned in lowercase,
+ <codeph>true</codeph> or <codeph>false</codeph>.
+ </p>
+
+ <p>
+ <b>Conversions:</b> Impala does not automatically convert any other type to <codeph>BOOLEAN</codeph>. All
+ conversions must use an explicit call to the <codeph>CAST()</codeph> function.
+ </p>
+
+ <p>
+ You can use <codeph>CAST()</codeph> to convert <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>,
+ <codeph>INT</codeph>, <codeph>BIGINT</codeph>, <codeph>FLOAT</codeph>, or <codeph>DOUBLE</codeph>
+<!-- any integer or floating-point type to -->
+ to <codeph>BOOLEAN</codeph>: a value of 0 represents <codeph>false</codeph>, and any non-zero value is converted
+ to <codeph>true</codeph>.
+ </p>
+
+ <p rev="1.4.0">
+<!-- BOOLEAN-to-DECIMAL casting requested in IMPALA-991. As of Sept. 2014, designated "won't fix". -->
+ You can cast <codeph>DECIMAL</codeph> values to <codeph>BOOLEAN</codeph>, with the same treatment of zero and
+ non-zero values as the other numeric types. You cannot cast a <codeph>BOOLEAN</codeph> to a
+ <codeph>DECIMAL</codeph>.
+ </p>
+
+ <p>
+ You cannot cast a <codeph>STRING</codeph> value to <codeph>BOOLEAN</codeph>, although you can cast a
+ <codeph>BOOLEAN</codeph> value to <codeph>STRING</codeph>, returning <codeph>'1'</codeph> for
+ <codeph>true</codeph> values and <codeph>'0'</codeph> for <codeph>false</codeph> values.
+ </p>
+
+ <p>
+ Although you can cast a <codeph>TIMESTAMP</codeph> to a <codeph>BOOLEAN</codeph> or a
+ <codeph>BOOLEAN</codeph> to a <codeph>TIMESTAMP</codeph>, the results are unlikely to be useful. Any non-zero
+ <codeph>TIMESTAMP</codeph> (that is, any value other than <codeph>1970-01-01 00:00:00</codeph>) becomes
+ <codeph>TRUE</codeph> when converted to <codeph>BOOLEAN</codeph>, while <codeph>1970-01-01 00:00:00</codeph>
+ becomes <codeph>FALSE</codeph>. A value of <codeph>FALSE</codeph> becomes <codeph>1970-01-01
+ 00:00:00</codeph> when converted to <codeph>TIMESTAMP</codeph>, and <codeph>TRUE</codeph> becomes one second
+ past this epoch date, that is, <codeph>1970-01-01 00:00:01</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/null_null_arguments"/>
+
+ <p conref="../shared/impala_common.xml#common/partitioning_blurb"/>
+
+ <p>
+ Do not use a <codeph>BOOLEAN</codeph> column as a partition key. Although you can create such a table,
+ subsequent operations produce errors:
+ </p>
+
+<codeblock>[localhost:21000] > create table truth_table (assertion string) partitioned by (truth boolean);
+[localhost:21000] > insert into truth_table values ('Pigs can fly',false);
+ERROR: AnalysisException: INSERT into table with BOOLEAN partition column (truth) is not supported: partitioning.truth_table
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>SELECT 1 &lt; 2;
+SELECT 2 = 5;
+SELECT 100 &lt; NULL, 100 > NULL;
+CREATE TABLE assertions (claim STRING, really BOOLEAN);
+INSERT INTO assertions VALUES
+ ("1 is less than 2", 1 &lt; 2),
+ ("2 is the same as 5", 2 = 5),
+ ("Grass is green", true),
+ ("The moon is made of green cheese", false);
+SELECT claim FROM assertions WHERE really = TRUE;
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/hbase_ok"/>
+
+ <p conref="../shared/impala_common.xml#common/parquet_ok"/>
+
+ <p conref="../shared/impala_common.xml#common/text_bulky"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> -->
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/internals_blurb"/> -->
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/added_in_20"/> -->
+
+ <p conref="../shared/impala_common.xml#common/column_stats_constant"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/restrictions_blurb"/> -->
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/related_info"/> -->
+
+ <p>
+ <b>Related information:</b> <xref href="impala_literals.xml#boolean_literals"/>,
+ <xref href="impala_operators.xml#operators"/>,
+ <xref href="impala_conditional_functions.xml#conditional_functions"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_char.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_char.xml b/docs/topics/impala_char.xml
new file mode 100644
index 0000000..68cabeb
--- /dev/null
+++ b/docs/topics/impala_char.xml
@@ -0,0 +1,275 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="char" rev="2.0.0">
+
+ <title>CHAR Data Type (CDH 5.2 or higher only)</title>
+ <titlealts><navtitle>CHAR (CDH 5.2 or higher only)</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">CHAR data type</indexterm>
+ A fixed-length character type, padded with trailing spaces if necessary to achieve the specified length. If
+ values are longer than the specified length, Impala truncates any trailing characters.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ In the column definition of a <codeph>CREATE TABLE</codeph> statement:
+ </p>
+
+<codeblock><varname>column_name</varname> CHAR(<varname>length</varname>)</codeblock>
+
+ <p>
+ The maximum length you can specify is 255.
+ </p>
+
+ <p>
+ <b>Semantics of trailing spaces:</b>
+ </p>
+
+ <ul>
+ <li>
+ When you store a <codeph>CHAR</codeph> value shorter than the specified length in a table, queries return
+ the value padded with trailing spaces if necessary; the resulting value has the same length as specified in
+ the column definition.
+ </li>
+
+ <li>
+ If you store a <codeph>CHAR</codeph> value containing trailing spaces in a table, those trailing spaces are
+ not stored in the data file. When the value is retrieved by a query, the result could have a different
+ number of trailing spaces. That is, the value includes however many spaces are needed to pad it to the
+ specified length of the column.
+ </li>
+
+ <li>
+ If you compare two <codeph>CHAR</codeph> values that differ only in the number of trailing spaces, those
+ values are considered identical.
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/partitioning_bad"/>
+
+ <p conref="../shared/impala_common.xml#common/hbase_no"/>
+
+ <p conref="../shared/impala_common.xml#common/parquet_blurb"/>
+
+ <ul>
+ <li>
+ This type can be read from and written to Parquet files.
+ </li>
+
+ <li>
+ There is no requirement for a particular level of Parquet.
+ </li>
+
+ <li>
+ Parquet files generated by Impala and containing this type can be freely interchanged with other components
+ such as Hive and MapReduce.
+ </li>
+
+ <li>
+ Any trailing spaces, whether implicitly or explicitly specified, are not written to the Parquet data files.
+ </li>
+
+ <li>
+ Parquet data files might contain values that are longer than allowed by the
+ <codeph>CHAR(<varname>n</varname>)</codeph> length limit. Impala ignores any extra trailing characters when
+ it processes those values during a query.
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/text_blurb"/>
+
+ <p>
+ Text data files might contain values that are longer than allowed for a particular
+ <codeph>CHAR(<varname>n</varname>)</codeph> column. Any extra trailing characters are ignored when Impala
+ processes those values during a query. Text data files can also contain values that are shorter than the
+ defined length limit, and Impala pads them with trailing spaces up to the specified length. Any text data
+ files produced by Impala <codeph>INSERT</codeph> statements do not include any trailing blanks for
+ <codeph>CHAR</codeph> columns.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/compatibility_blurb"/>
+
+ <p>
+ This type is available using Impala 2.0 or higher under CDH 4, or with Impala on CDH 5.2 or higher. There are
+ no compatibility issues with other components when exchanging data files or running Impala on CDH 4.
+ </p>
+
+ <p>
+ Some other database systems make the length specification optional. For Impala, the length is required.
+ </p>
+
+<!--
+<p>
+The Impala maximum length is larger than for the <codeph>CHAR</codeph> data type in Hive.
+If a Hive query encounters a <codeph>CHAR</codeph> value longer than 255 during processing,
+it silently treats the value as length 255.
+</p>
+-->
+
+ <p conref="../shared/impala_common.xml#common/internals_max_bytes"/>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+ <p conref="../shared/impala_common.xml#common/column_stats_constant"/>
+
+<!-- Seems like a logical design decision but don't think it's currently implemented like this.
+<p>
+Because both the maximum and average length are always known and always the same for
+any given <codeph>CHAR(<varname>n</varname>)</codeph> column, those fields are always filled
+in for <codeph>SHOW COLUMN STATS</codeph> output, even before you run
+<codeph>COMPUTE STATS</codeph> on the table.
+</p>
+-->
+
+ <p conref="../shared/impala_common.xml#common/udf_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ These examples show how trailing spaces are not considered significant when comparing or processing
+ <codeph>CHAR</codeph> values. <codeph>CAST()</codeph> truncates any longer string to fit within the defined
+ length. If a <codeph>CHAR</codeph> value is shorter than the specified length, it is padded on the right with
+ spaces until it matches the specified length. Therefore, <codeph>LENGTH()</codeph> represents the length
+ including any trailing spaces, and <codeph>CONCAT()</codeph> also treats the column value as if it has
+ trailing spaces.
+ </p>
+
+<codeblock>select cast('x' as char(4)) = cast('x ' as char(4)) as "unpadded equal to padded";
++--------------------------+
+| unpadded equal to padded |
++--------------------------+
+| true |
++--------------------------+
+
+create table char_length(c char(3));
+insert into char_length values (cast('1' as char(3))), (cast('12' as char(3))), (cast('123' as char(3))), (cast('123456' as char(3)));
+select concat("[",c,"]") as c, length(c) from char_length;
++-------+-----------+
+| c | length(c) |
++-------+-----------+
+| [1 ] | 3 |
+| [12 ] | 3 |
+| [123] | 3 |
+| [123] | 3 |
++-------+-----------+
+</codeblock>
+
+ <p>
+ This example shows a case where data values are known to have a specific length, where <codeph>CHAR</codeph>
+ is a logical data type to use.
+<!--
+Because all the <codeph>CHAR</codeph> values have a constant predictable length,
+Impala can efficiently analyze how best to use these values in join queries,
+aggregation queries, and other contexts where column length is significant.
+-->
+ </p>
+
+<codeblock>create table addresses
+ (id bigint,
+ street_name string,
+ state_abbreviation char(2),
+ country_abbreviation char(2));
+</codeblock>
+
+ <p>
+ The following example shows how values written by Impala do not physically include the trailing spaces. It
+ creates a table using text format, with <codeph>CHAR</codeph> values much shorter than the declared length,
+ and then prints the resulting data file to show that the delimited values are not separated by spaces. The
+ same behavior applies to binary-format Parquet data files.
+ </p>
+
+<codeblock>create table char_in_text (a char(20), b char(30), c char(40))
+ row format delimited fields terminated by ',';
+
+insert into char_in_text values (cast('foo' as char(20)), cast('bar' as char(30)), cast('baz' as char(40))), (cast('hello' as char(20)), cast('goodbye' as char(30)), cast('aloha' as char(40)));
+
+-- Running this Linux command inside impala-shell using the ! shortcut.
+!hdfs dfs -cat 'hdfs://127.0.0.1:8020/user/hive/warehouse/impala_doc_testing.db/char_in_text/*.*';
+foo,bar,baz
+hello,goodbye,aloha
+</codeblock>
+
+ <p>
+ The following example further illustrates the treatment of spaces. It replaces the contents of the previous
+ table with some values including leading spaces, trailing spaces, or both. Any leading spaces are preserved
+ within the data file, but trailing spaces are discarded. Then when the values are retrieved by a query, the
+ leading spaces are retrieved verbatim while any necessary trailing spaces are supplied by Impala.
+ </p>
+
+<codeblock>insert overwrite char_in_text values (cast('trailing ' as char(20)), cast(' leading and trailing ' as char(30)), cast(' leading' as char(40)));
+!hdfs dfs -cat 'hdfs://127.0.0.1:8020/user/hive/warehouse/impala_doc_testing.db/char_in_text/*.*';
+trailing, leading and trailing, leading
+
+select concat('[',a,']') as a, concat('[',b,']') as b, concat('[',c,']') as c from char_in_text;
++------------------------+----------------------------------+--------------------------------------------+
+| a | b | c |
++------------------------+----------------------------------+--------------------------------------------+
+| [trailing ] | [ leading and trailing ] | [ leading ] |
++------------------------+----------------------------------+--------------------------------------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p>
+ Because the blank-padding behavior requires allocating the maximum length for each value in memory, for
+ scalability reasons avoid declaring <codeph>CHAR</codeph> columns that are much longer than typical values in
+ that column.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/blobs_are_strings"/>
+
+ <p>
+ When an expression compares a <codeph>CHAR</codeph> with a <codeph>STRING</codeph> or
+ <codeph>VARCHAR</codeph>, the <codeph>CHAR</codeph> value is implicitly converted to <codeph>STRING</codeph>
+ first, with trailing spaces preserved.
+ </p>
+
+<codeblock>select cast("foo " as char(5)) = 'foo' as "char equal to string";
++----------------------+
+| char equal to string |
++----------------------+
+| false |
++----------------------+
+</codeblock>
+
+ <p>
+ This behavior differs from other popular database systems. To get the expected result of
+ <codeph>TRUE</codeph>, cast the expressions on both sides to <codeph>CHAR</codeph> values of the appropriate
+ length:
+ </p>
+
+<codeblock>select cast("foo " as char(5)) = cast('foo' as char(3)) as "char equal to string";
++----------------------+
+| char equal to string |
++----------------------+
+| true |
++----------------------+
+</codeblock>
+
+ <p>
+ This behavior is subject to change in future releases.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_string.xml#string"/>, <xref href="impala_varchar.xml#varchar"/>,
+ <xref href="impala_literals.xml#string_literals"/>,
+ <xref href="impala_string_functions.xml#string_functions"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_comments.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_comments.xml b/docs/topics/impala_comments.xml
new file mode 100644
index 0000000..96b9479
--- /dev/null
+++ b/docs/topics/impala_comments.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="comments">
+
+ <title>Comments</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">comments (SQL)</indexterm>
+ Impala supports the familiar styles of SQL comments:
+ </p>
+
+ <ul>
+ <li>
+ All text from a <codeph>--</codeph> sequence to the end of the line is considered a comment and ignored.
+ This type of comment can occur on a single line by itself, or after all or part of a statement.
+ </li>
+
+ <li>
+ All text from a <codeph>/*</codeph> sequence to the next <codeph>*/</codeph> sequence is considered a
+ comment and ignored. This type of comment can stretch over multiple lines. This type of comment can occur
+ on one or more lines by itself, in the middle of a statement, or before or after a statement.
+ </li>
+ </ul>
+
+ <p>
+ For example:
+ </p>
+
+<codeblock>-- This line is a comment about a table.
+create table ...;
+
+/*
+This is a multi-line comment about a query.
+*/
+select ...;
+
+select * from t /* This is an embedded comment about a query. */ where ...;
+
+select * from t -- This is a trailing comment within a multi-line command.
+where ...;
+</codeblock>
+ </conbody>
+</concept>
[16/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_compression_codec.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_compression_codec.xml b/docs/topics/impala_compression_codec.xml
new file mode 100644
index 0000000..d99ac04
--- /dev/null
+++ b/docs/topics/impala_compression_codec.xml
@@ -0,0 +1,95 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.0.0" id="compression_codec">
+
+ <title>COMPRESSION_CODEC Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Compression"/>
+ <data name="Category" value="File Formats"/>
+ <data name="Category" value="Parquet"/>
+ <data name="Category" value="Snappy"/>
+ <data name="Category" value="GZip"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+<!-- The initial part of this paragraph is copied straight from the #parquet_compression topic. -->
+
+<!-- Could turn into a conref. -->
+
+ <p>
+ <indexterm audience="Cloudera">COMPRESSION_CODEC query option</indexterm>
+ When Impala writes Parquet data files using the <codeph>INSERT</codeph> statement, the underlying compression
+ is controlled by the <codeph>COMPRESSION_CODEC</codeph> query option.
+ </p>
+
+ <note>
+ Prior to Impala 2.0, this option was named <codeph>PARQUET_COMPRESSION_CODEC</codeph>. In Impala 2.0 and
+ later, the <codeph>PARQUET_COMPRESSION_CODEC</codeph> name is not recognized. Use the more general name
+ <codeph>COMPRESSION_CODEC</codeph> for new code.
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>SET COMPRESSION_CODEC=<varname>codec_name</varname>;</codeblock>
+
+ <p>
+ The allowed values for this query option are <codeph>SNAPPY</codeph> (the default), <codeph>GZIP</codeph>,
+ and <codeph>NONE</codeph>.
+ </p>
+
+ <note>
+ A Parquet file created with <codeph>COMPRESSION_CODEC=NONE</codeph> is still typically smaller than the
+ original data, due to encoding schemes such as run-length encoding and dictionary encoding that are applied
+ separately from compression.
+ </note>
+
+ <p></p>
+
+ <p>
+ The option value is not case-sensitive.
+ </p>
+
+ <p>
+ If the option is set to an unrecognized value, all kinds of queries will fail due to the invalid option
+ setting, not just queries involving Parquet tables. (The value <codeph>BZIP2</codeph> is also recognized, but
+ is not compatible with Parquet tables.)
+ </p>
+
+ <p>
+ <b>Type:</b> <codeph>STRING</codeph>
+ </p>
+
+ <p>
+ <b>Default:</b> SNAPPY
+ </p>
+
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>set compression_codec=gzip;
+insert into parquet_table_highly_compressed select * from t1;
+
+set compression_codec=snappy;
+insert into parquet_table_compression_plus_fast_queries select * from t1;
+
+set compression_codec=none;
+insert into parquet_table_no_compression select * from t1;
+
+set compression_codec=foo;
+select * from t1 limit 5;
+ERROR: Invalid compression codec: foo
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ For information about how compressing Parquet data files affects query performance, see
+ <xref href="impala_parquet.xml#parquet_compression"/>.
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_compute_stats.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_compute_stats.xml b/docs/topics/impala_compute_stats.xml
new file mode 100644
index 0000000..abf6645
--- /dev/null
+++ b/docs/topics/impala_compute_stats.xml
@@ -0,0 +1,418 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.2.2" id="compute_stats">
+
+ <title>COMPUTE STATS Statement</title>
+ <titlealts><navtitle>COMPUTE STATS</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Performance"/>
+ <data name="Category" value="Scalability"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Tables"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">COMPUTE STATS statement</indexterm>
+ Gathers information about volume and distribution of data in a table and all associated columns and
+ partitions. The information is stored in the metastore database, and used by Impala to help optimize queries.
+      For example, if Impala can determine that a table is large or small, or has many or few distinct values, it
+      can organize and parallelize the work appropriately for a join query or insert operation. For details about the
+ kinds of information gathered by this statement, see <xref href="impala_perf_stats.xml#perf_stats"/>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock rev="2.1.0">COMPUTE STATS [<varname>db_name</varname>.]<varname>table_name</varname>
+COMPUTE INCREMENTAL STATS [<varname>db_name</varname>.]<varname>table_name</varname> [PARTITION (<varname>partition_spec</varname>)]
+
+<varname>partition_spec</varname> ::= <varname>partition_col</varname>=<varname>constant_value</varname>
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/incremental_partition_spec"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Originally, Impala relied on users to run the Hive <codeph>ANALYZE TABLE</codeph> statement, but that method
+ of gathering statistics proved unreliable and difficult to use. The Impala <codeph>COMPUTE STATS</codeph>
+ statement is built from the ground up to improve the reliability and user-friendliness of this operation.
+ <codeph>COMPUTE STATS</codeph> does not require any setup steps or special configuration. You only run a
+ single Impala <codeph>COMPUTE STATS</codeph> statement to gather both table and column statistics, rather
+ than separate Hive <codeph>ANALYZE TABLE</codeph> statements for each kind of statistics.
+ </p>
+
+ <p rev="2.1.0">
+ The <codeph>COMPUTE INCREMENTAL STATS</codeph> variation is a shortcut for partitioned tables that works on a
+ subset of partitions rather than the entire table. The incremental nature makes it suitable for large tables
+ with many partitions, where a full <codeph>COMPUTE STATS</codeph> operation takes too long to be practical
+ each time a partition is added or dropped. See <xref href="impala_perf_stats.xml#perf_stats_incremental"/>
+ for full usage details.
+ </p>
+
+ <p>
+ <codeph>COMPUTE INCREMENTAL STATS</codeph> only applies to partitioned tables. If you use the
+ <codeph>INCREMENTAL</codeph> clause for an unpartitioned table, Impala automatically uses the original
+ <codeph>COMPUTE STATS</codeph> statement. Such tables display <codeph>false</codeph> under the
+ <codeph>Incremental stats</codeph> column of the <codeph>SHOW TABLE STATS</codeph> output.
+ </p>
+
+ <note>
+ Because many of the most performance-critical and resource-intensive operations rely on table and column
+ statistics to construct accurate and efficient plans, <codeph>COMPUTE STATS</codeph> is an important step at
+ the end of your ETL process. Run <codeph>COMPUTE STATS</codeph> on all tables as your first step during
+ performance tuning for slow queries, or troubleshooting for out-of-memory conditions:
+ <ul>
+ <li>
+ Accurate statistics help Impala construct an efficient query plan for join queries, improving performance
+ and reducing memory usage.
+ </li>
+
+ <li>
+ Accurate statistics help Impala distribute the work effectively for insert operations into Parquet
+ tables, improving performance and reducing memory usage.
+ </li>
+
+ <li rev="1.3.0">
+ Accurate statistics help Impala estimate the memory required for each query, which is important when you
+ use resource management features, such as admission control and the YARN resource management framework.
+ The statistics help Impala to achieve high concurrency, full utilization of available memory, and avoid
+ contention with workloads from other Hadoop components.
+ </li>
+ </ul>
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p rev="2.3.0">
+ Currently, the statistics created by the <codeph>COMPUTE STATS</codeph> statement do not include
+ information about complex type columns. The column stats metrics for complex columns are always shown
+ as -1. For queries involving complex type columns, Impala uses
+ heuristics to estimate the data distribution within such columns.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/hbase_blurb"/>
+
+ <p>
+ <codeph>COMPUTE STATS</codeph> works for HBase tables also. The statistics gathered for HBase tables are
+ somewhat different than for HDFS-backed tables, but that metadata is still used for optimization when HBase
+ tables are involved in join queries.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/s3_blurb"/>
+
+ <p rev="2.2.0">
+ <codeph>COMPUTE STATS</codeph> also works for tables where data resides in the Amazon Simple Storage Service (S3).
+ See <xref href="impala_s3.xml#s3"/> for details.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/performance_blurb"/>
+
+ <p>
+      The statistics collected by <codeph>COMPUTE STATS</codeph> are used to optimize join queries,
+      <codeph>INSERT</codeph> operations into Parquet tables, and other resource-intensive kinds of SQL statements.
+ See <xref href="impala_perf_stats.xml#perf_stats"/> for details.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+      This example shows two tables, <codeph>T1</codeph> and <codeph>T2</codeph>, with a small number of distinct
+      values linked by a parent-child relationship between <codeph>T1.ID</codeph> and <codeph>T2.PARENT</codeph>.
+      <codeph>T1</codeph> is tiny, while <codeph>T2</codeph> has approximately 100K rows. Initially, the statistics
+      include physical measurements such as the number of files, the total size, and size measurements for
+      fixed-length columns such as those with the <codeph>INT</codeph> type. Unknown values are represented by -1. After
+ running <codeph>COMPUTE STATS</codeph> for each table, much more information is available through the
+ <codeph>SHOW STATS</codeph> statements. If you were running a join query involving both of these tables, you
+ would need statistics for both tables to get the most effective optimization for the query.
+ </p>
+
+<!-- Note: chopped off any excess characters at position 87 and after,
+ to avoid weird wrapping in PDF.
+ Applies to any subsequent examples with output from SHOW ... STATS too. -->
+
+<codeblock>[localhost:21000] > show table stats t1;
+Query: show table stats t1
++-------+--------+------+--------+
+| #Rows | #Files | Size | Format |
++-------+--------+------+--------+
+| -1 | 1 | 33B | TEXT |
++-------+--------+------+--------+
+Returned 1 row(s) in 0.02s
+[localhost:21000] > show table stats t2;
+Query: show table stats t2
++-------+--------+----------+--------+
+| #Rows | #Files | Size | Format |
++-------+--------+----------+--------+
+| -1 | 28 | 960.00KB | TEXT |
++-------+--------+----------+--------+
+Returned 1 row(s) in 0.01s
+[localhost:21000] > show column stats t1;
+Query: show column stats t1
++--------+--------+------------------+--------+----------+----------+
+| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size |
++--------+--------+------------------+--------+----------+----------+
+| id | INT | -1 | -1 | 4 | 4 |
+| s | STRING | -1 | -1 | -1 | -1 |
++--------+--------+------------------+--------+----------+----------+
+Returned 2 row(s) in 1.71s
+[localhost:21000] > show column stats t2;
+Query: show column stats t2
++--------+--------+------------------+--------+----------+----------+
+| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size |
++--------+--------+------------------+--------+----------+----------+
+| parent | INT | -1 | -1 | 4 | 4 |
+| s | STRING | -1 | -1 | -1 | -1 |
++--------+--------+------------------+--------+----------+----------+
+Returned 2 row(s) in 0.01s
+[localhost:21000] > compute stats t1;
+Query: compute stats t1
++-----------------------------------------+
+| summary |
++-----------------------------------------+
+| Updated 1 partition(s) and 2 column(s). |
++-----------------------------------------+
+Returned 1 row(s) in 5.30s
+[localhost:21000] > show table stats t1;
+Query: show table stats t1
++-------+--------+------+--------+
+| #Rows | #Files | Size | Format |
++-------+--------+------+--------+
+| 3 | 1 | 33B | TEXT |
++-------+--------+------+--------+
+Returned 1 row(s) in 0.01s
+[localhost:21000] > show column stats t1;
+Query: show column stats t1
++--------+--------+------------------+--------+----------+----------+
+| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size |
++--------+--------+------------------+--------+----------+----------+
+| id | INT | 3 | -1 | 4 | 4 |
+| s | STRING | 3 | -1 | -1 | -1 |
++--------+--------+------------------+--------+----------+----------+
+Returned 2 row(s) in 0.02s
+[localhost:21000] > compute stats t2;
+Query: compute stats t2
++-----------------------------------------+
+| summary |
++-----------------------------------------+
+| Updated 1 partition(s) and 2 column(s). |
++-----------------------------------------+
+Returned 1 row(s) in 5.70s
+[localhost:21000] > show table stats t2;
+Query: show table stats t2
++-------+--------+----------+--------+
+| #Rows | #Files | Size | Format |
++-------+--------+----------+--------+
+| 98304 | 1 | 960.00KB | TEXT |
++-------+--------+----------+--------+
+Returned 1 row(s) in 0.03s
+[localhost:21000] > show column stats t2;
+Query: show column stats t2
++--------+--------+------------------+--------+----------+----------+
+| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size |
++--------+--------+------------------+--------+----------+----------+
+| parent | INT | 3 | -1 | 4 | 4 |
+| s | STRING | 6 | -1 | 14 | 9.3 |
++--------+--------+------------------+--------+----------+----------+
+Returned 2 row(s) in 0.01s</codeblock>
+
+ <p rev="2.1.0">
+ The following example shows how to use the <codeph>INCREMENTAL</codeph> clause, available in Impala 2.1.0 and
+ higher. The <codeph>COMPUTE INCREMENTAL STATS</codeph> syntax lets you collect statistics for newly added or
+ changed partitions, without rescanning the entire table.
+ </p>
+
+<codeblock>-- Initially the table has no incremental stats, as indicated
+-- by -1 under #Rows and false under Incremental stats.
+show table stats item_partitioned;
++-------------+-------+--------+----------+--------------+---------+------------------
+| i_category | #Rows | #Files | Size | Bytes Cached | Format | Incremental stats
++-------------+-------+--------+----------+--------------+---------+------------------
+| Books | -1 | 1 | 223.74KB | NOT CACHED | PARQUET | false
+| Children | -1 | 1 | 230.05KB | NOT CACHED | PARQUET | false
+| Electronics | -1 | 1 | 232.67KB | NOT CACHED | PARQUET | false
+| Home | -1 | 1 | 232.56KB | NOT CACHED | PARQUET | false
+| Jewelry | -1 | 1 | 223.72KB | NOT CACHED | PARQUET | false
+| Men | -1 | 1 | 231.25KB | NOT CACHED | PARQUET | false
+| Music | -1 | 1 | 237.90KB | NOT CACHED | PARQUET | false
+| Shoes | -1 | 1 | 234.90KB | NOT CACHED | PARQUET | false
+| Sports | -1 | 1 | 227.97KB | NOT CACHED | PARQUET | false
+| Women | -1 | 1 | 226.27KB | NOT CACHED | PARQUET | false
+| Total | -1 | 10 | 2.25MB | 0B | |
++-------------+-------+--------+----------+--------------+---------+------------------
+
+-- After the first COMPUTE INCREMENTAL STATS,
+-- all partitions have stats.
+compute incremental stats item_partitioned;
++-------------------------------------------+
+| summary |
++-------------------------------------------+
+| Updated 10 partition(s) and 21 column(s). |
++-------------------------------------------+
+show table stats item_partitioned;
++-------------+-------+--------+----------+--------------+---------+------------------
+| i_category | #Rows | #Files | Size | Bytes Cached | Format | Incremental stats
++-------------+-------+--------+----------+--------------+---------+------------------
+| Books | 1733 | 1 | 223.74KB | NOT CACHED | PARQUET | true
+| Children | 1786 | 1 | 230.05KB | NOT CACHED | PARQUET | true
+| Electronics | 1812 | 1 | 232.67KB | NOT CACHED | PARQUET | true
+| Home | 1807 | 1 | 232.56KB | NOT CACHED | PARQUET | true
+| Jewelry | 1740 | 1 | 223.72KB | NOT CACHED | PARQUET | true
+| Men | 1811 | 1 | 231.25KB | NOT CACHED | PARQUET | true
+| Music | 1860 | 1 | 237.90KB | NOT CACHED | PARQUET | true
+| Shoes | 1835 | 1 | 234.90KB | NOT CACHED | PARQUET | true
+| Sports | 1783 | 1 | 227.97KB | NOT CACHED | PARQUET | true
+| Women | 1790 | 1 | 226.27KB | NOT CACHED | PARQUET | true
+| Total | 17957 | 10 | 2.25MB | 0B | |
++-------------+-------+--------+----------+--------------+---------+------------------
+
+-- Add a new partition...
+alter table item_partitioned add partition (i_category='Camping');
+-- Add or replace files in HDFS outside of Impala,
+-- rendering the stats for a partition obsolete.
+!import_data_into_sports_partition.sh
+refresh item_partitioned;
+drop incremental stats item_partitioned partition (i_category='Sports');
+-- Now some partitions have incremental stats
+-- and some don't.
+show table stats item_partitioned;
++-------------+-------+--------+----------+--------------+---------+------------------
+| i_category | #Rows | #Files | Size | Bytes Cached | Format | Incremental stats
++-------------+-------+--------+----------+--------------+---------+------------------
+| Books | 1733 | 1 | 223.74KB | NOT CACHED | PARQUET | true
+| Camping | -1 | 1 | 408.02KB | NOT CACHED | PARQUET | false
+| Children | 1786 | 1 | 230.05KB | NOT CACHED | PARQUET | true
+| Electronics | 1812 | 1 | 232.67KB | NOT CACHED | PARQUET | true
+| Home | 1807 | 1 | 232.56KB | NOT CACHED | PARQUET | true
+| Jewelry | 1740 | 1 | 223.72KB | NOT CACHED | PARQUET | true
+| Men | 1811 | 1 | 231.25KB | NOT CACHED | PARQUET | true
+| Music | 1860 | 1 | 237.90KB | NOT CACHED | PARQUET | true
+| Shoes | 1835 | 1 | 234.90KB | NOT CACHED | PARQUET | true
+| Sports | -1 | 1 | 227.97KB | NOT CACHED | PARQUET | false
+| Women | 1790 | 1 | 226.27KB | NOT CACHED | PARQUET | true
+| Total | 17957 | 11 | 2.65MB | 0B | |
++-------------+-------+--------+----------+--------------+---------+------------------
+
+-- After another COMPUTE INCREMENTAL STATS,
+-- all partitions have incremental stats, and only the 2
+-- partitions without incremental stats were scanned.
+compute incremental stats item_partitioned;
++------------------------------------------+
+| summary |
++------------------------------------------+
+| Updated 2 partition(s) and 21 column(s). |
++------------------------------------------+
+show table stats item_partitioned;
++-------------+-------+--------+----------+--------------+---------+------------------
+| i_category | #Rows | #Files | Size | Bytes Cached | Format | Incremental stats
++-------------+-------+--------+----------+--------------+---------+------------------
+| Books | 1733 | 1 | 223.74KB | NOT CACHED | PARQUET | true
+| Camping | 5328 | 1 | 408.02KB | NOT CACHED | PARQUET | true
+| Children | 1786 | 1 | 230.05KB | NOT CACHED | PARQUET | true
+| Electronics | 1812 | 1 | 232.67KB | NOT CACHED | PARQUET | true
+| Home | 1807 | 1 | 232.56KB | NOT CACHED | PARQUET | true
+| Jewelry | 1740 | 1 | 223.72KB | NOT CACHED | PARQUET | true
+| Men | 1811 | 1 | 231.25KB | NOT CACHED | PARQUET | true
+| Music | 1860 | 1 | 237.90KB | NOT CACHED | PARQUET | true
+| Shoes | 1835 | 1 | 234.90KB | NOT CACHED | PARQUET | true
+| Sports | 1783 | 1 | 227.97KB | NOT CACHED | PARQUET | true
+| Women | 1790 | 1 | 226.27KB | NOT CACHED | PARQUET | true
+| Total | 17957 | 11 | 2.65MB | 0B | |
++-------------+-------+--------+----------+--------------+---------+------------------
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/file_format_blurb"/>
+
+ <p>
+ The <codeph>COMPUTE STATS</codeph> statement works with tables created with any of the file formats supported
+ by Impala. See <xref href="impala_file_formats.xml#file_formats"/> for details about working with the
+ different file formats. The following considerations apply to <codeph>COMPUTE STATS</codeph> depending on the
+ file format of the table.
+ </p>
+
+ <p>
+ The <codeph>COMPUTE STATS</codeph> statement works with text tables with no restrictions. These tables can be
+ created through either Impala or Hive.
+ </p>
+
+ <p>
+ The <codeph>COMPUTE STATS</codeph> statement works with Parquet tables. These tables can be created through
+ either Impala or Hive.
+ <note conref="../shared/impala_common.xml#common/compute_stats_parquet"/>
+ </p>
+
+ <p>
+ The <codeph>COMPUTE STATS</codeph> statement works with Avro tables, as long as they are created with
+ SQL-style column names and types rather than an Avro-style schema specification. These tables are currently
+ always created through Hive rather than Impala.
+ </p>
+
+ <p>
+ The <codeph>COMPUTE STATS</codeph> statement works with RCFile tables with no restrictions. These tables can
+ be created through either Impala or Hive.
+ </p>
+
+ <p>
+ The <codeph>COMPUTE STATS</codeph> statement works with SequenceFile tables with no restrictions. These
+ tables can be created through either Impala or Hive.
+ </p>
+
+ <p>
+ The <codeph>COMPUTE STATS</codeph> statement works with partitioned tables, whether all the partitions use
+ the same file format, or some partitions are defined through <codeph>ALTER TABLE</codeph> to use different
+ file formats.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_maybe"/>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/decimal_no_stats"/>
+
+ <note conref="../shared/impala_common.xml#common/compute_stats_nulls"/>
+
+ <p conref="../shared/impala_common.xml#common/internals_blurb"/>
+ <p>
+ Behind the scenes, the <codeph>COMPUTE STATS</codeph> statement
+ executes two statements: one to count the rows of each partition
+ in the table (or the entire table if unpartitioned) through the
+ <codeph>COUNT(*)</codeph> function,
+ and another to count the approximate number of distinct values
+ in each column through the <codeph>NDV()</codeph> function.
+ You might see these queries in your monitoring and diagnostic displays.
+ The same factors that affect the performance, scalability, and
+ execution of other queries (such as parallel execution, memory usage,
+ admission control, and timeouts) also apply to the queries run by the
+ <codeph>COMPUTE STATS</codeph> statement.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have read
+ permission for all affected files in the source directory:
+ all files in the case of an unpartitioned table or
+ a partitioned table in the case of <codeph>COMPUTE STATS</codeph>;
+ or all the files in partitions without incremental stats in
+ the case of <codeph>COMPUTE INCREMENTAL STATS</codeph>.
+ It must also have read and execute permissions for all
+ relevant directories holding the data files.
+ (Essentially, <codeph>COMPUTE STATS</codeph> requires the
+ same permissions as the underlying <codeph>SELECT</codeph> queries it runs
+ against the table.)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_drop_stats.xml#drop_stats"/>, <xref href="impala_show.xml#show_table_stats"/>,
+ <xref href="impala_show.xml#show_column_stats"/>, <xref href="impala_perf_stats.xml#perf_stats"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_conditional_functions.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_conditional_functions.xml b/docs/topics/impala_conditional_functions.xml
new file mode 100644
index 0000000..b922710
--- /dev/null
+++ b/docs/topics/impala_conditional_functions.xml
@@ -0,0 +1,443 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="conditional_functions">
+
+ <title>Impala Conditional Functions</title>
+ <titlealts><navtitle>Conditional Functions</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Impala supports the following conditional functions for testing equality, comparison operators, and nullity:
+ </p>
+
+ <dl>
+ <dlentry id="case">
+
+ <dt>
+ <codeph>CASE a WHEN b THEN c [WHEN d THEN e]... [ELSE f] END</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">CASE expression</indexterm>
+ <b>Purpose:</b> Compares an expression to one or more possible values, and returns a corresponding result
+ when a match is found.
+ <p conref="../shared/impala_common.xml#common/return_same_type"/>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+          In this form of the <codeph>CASE</codeph> expression, the initial value <codeph>A</codeph>
+          being evaluated for each row is typically a column reference, or an expression involving
+ a column. This form can only compare against a set of specified values, not ranges,
+ multi-value comparisons such as <codeph>BETWEEN</codeph> or <codeph>IN</codeph>,
+ regular expressions, or <codeph>NULL</codeph>.
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ Although this example is split across multiple lines, you can put any or all parts of a <codeph>CASE</codeph> expression
+ on a single line, with no punctuation or other separators between the <codeph>WHEN</codeph>,
+ <codeph>ELSE</codeph>, and <codeph>END</codeph> clauses.
+ </p>
+<codeblock>select case x
+ when 1 then 'one'
+ when 2 then 'two'
+ when 0 then 'zero'
+ else 'out of range'
+ end
+ from t1;
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="case2">
+
+ <dt>
+ <codeph>CASE WHEN a THEN b [WHEN c THEN d]... [ELSE e] END</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">CASE expression</indexterm>
+ <b>Purpose:</b> Tests whether any of a sequence of expressions is true, and returns a corresponding
+ result for the first true expression.
+ <p conref="../shared/impala_common.xml#common/return_same_type"/>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ <codeph>CASE</codeph> expressions without an initial test value have more flexibility.
+ For example, they can test different columns in different <codeph>WHEN</codeph> clauses,
+ or use comparison operators such as <codeph>BETWEEN</codeph>, <codeph>IN</codeph> and <codeph>IS NULL</codeph>
+ rather than comparing against discrete values.
+ </p>
+ <p>
+ <codeph>CASE</codeph> expressions are often the foundation of long queries that
+ summarize and format results for easy-to-read reports. For example, you might
+ use a <codeph>CASE</codeph> function call to turn values from a numeric column
+ into category strings corresponding to integer values, or labels such as <q>Small</q>,
+ <q>Medium</q> and <q>Large</q> based on ranges. Then subsequent parts of the
+ query might aggregate based on the transformed values, such as how many
+ values are classified as small, medium, or large. You can also use <codeph>CASE</codeph>
+ to signal problems with out-of-bounds values, <codeph>NULL</codeph> values,
+ and so on.
+ </p>
+ <p>
+ By using operators such as <codeph>OR</codeph>, <codeph>IN</codeph>,
+ <codeph>REGEXP</codeph>, and so on in <codeph>CASE</codeph> expressions,
+ you can build extensive tests and transformations into a single query.
+ Therefore, applications that construct SQL statements often rely heavily on <codeph>CASE</codeph>
+ calls in the generated SQL code.
+ </p>
+ <p>
+ Because this flexible form of the <codeph>CASE</codeph> expressions allows you to perform
+ many comparisons and call multiple functions when evaluating each row, be careful applying
+ elaborate <codeph>CASE</codeph> expressions to queries that process large amounts of data.
+ For example, when practical, evaluate and transform values through <codeph>CASE</codeph>
+ after applying operations such as aggregations that reduce the size of the result set;
+ transform numbers to strings after performing joins with the original numeric values.
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ Although this example is split across multiple lines, you can put any or all parts of a <codeph>CASE</codeph> expression
+ on a single line, with no punctuation or other separators between the <codeph>WHEN</codeph>,
+ <codeph>ELSE</codeph>, and <codeph>END</codeph> clauses.
+ </p>
+<codeblock>select case
+ when dayname(now()) in ('Saturday','Sunday') then 'result undefined on weekends'
+ when x > y then 'x greater than y'
+ when x = y then 'x and y are equal'
+ when x is null or y is null then 'one of the columns is null'
+ else null
+ end
+ from t1;
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="coalesce">
+
+ <dt>
+ <codeph>coalesce(type v1, type v2, ...)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">coalesce() function</indexterm>
+ <b>Purpose:</b> Returns the first specified argument that is not <codeph>NULL</codeph>, or
+ <codeph>NULL</codeph> if all arguments are <codeph>NULL</codeph>.
+ <p conref="../shared/impala_common.xml#common/return_same_type"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="2.0.0" id="decode">
+
+ <dt>
+ <codeph>decode(type expression, type search1, type result1 [, type search2, type result2 ...] [, type
+ default] )</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">decode() function</indexterm>
+ <b>Purpose:</b> Compares an expression to one or more possible values, and returns a corresponding result
+ when a match is found.
+ <p conref="../shared/impala_common.xml#common/return_same_type"/>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Can be used as shorthand for a <codeph>CASE</codeph> expression.
+ </p>
+ <p>
+          The original expression and the search expressions must be of the same type or convertible types. The
+          result expression can be a different type, but all result expressions must be of the same type.
+ </p>
+ <p>
+          Returns a successful match if the original expression is <codeph>NULL</codeph> and a search expression
+          is also <codeph>NULL</codeph>.
+ </p>
+ <p>
+ Returns <codeph>NULL</codeph> if the final <codeph>default</codeph> value is omitted and none of the
+ search expressions match the original expression.
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following example translates numeric day values into descriptive names:
+ </p>
+<codeblock>SELECT event, decode(day_of_week, 1, "Monday", 2, "Tuesday", 3, "Wednesday",
+ 4, "Thursday", 5, "Friday", 6, "Saturday", 7, "Sunday", "Unknown day")
+ FROM calendar;
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="if">
+
+ <dt>
+ <codeph>if(boolean condition, type ifTrue, type ifFalseOrNull)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">if() function</indexterm>
+ <b>Purpose:</b> Tests an expression and returns a corresponding result depending on whether the result is
+ true, false, or <codeph>NULL</codeph>.
+ <p>
+ <b>Return type:</b> Same as the <codeph>ifTrue</codeph> argument value
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="ifnull">
+
+ <dt>
+ <codeph>ifnull(type a, type ifNotNull)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">isnull() function</indexterm>
+ <b>Purpose:</b> Alias for the <codeph>isnull()</codeph> function, with the same behavior. To simplify
+ porting SQL with vendor extensions to Impala.
+ <p conref="../shared/impala_common.xml#common/added_in_130"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="isfalse" rev="2.2.0">
+
+ <dt>
+ <codeph>isfalse(<varname>boolean</varname>)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">isfalse() function</indexterm>
+ <b>Purpose:</b> Tests if a Boolean expression is <codeph>false</codeph> or not.
+ Returns <codeph>true</codeph> if so.
+ If the argument is <codeph>NULL</codeph>, returns <codeph>false</codeph>.
+ Identical to <codeph>isnottrue()</codeph>, except it returns the opposite value for a <codeph>NULL</codeph> argument.
+ <p conref="../shared/impala_common.xml#common/return_type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/added_in_220"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="isnotfalse" rev="2.2.0">
+
+ <dt>
+ <codeph>isnotfalse(<varname>boolean</varname>)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">isnotfalse() function</indexterm>
+ <b>Purpose:</b> Tests if a Boolean expression is not <codeph>false</codeph> (that is, either <codeph>true</codeph> or <codeph>NULL</codeph>).
+ Returns <codeph>true</codeph> if so.
+ If the argument is <codeph>NULL</codeph>, returns <codeph>true</codeph>.
+ Identical to <codeph>istrue()</codeph>, except it returns the opposite value for a <codeph>NULL</codeph> argument.
+ <p conref="../shared/impala_common.xml#common/return_type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/for_compatibility_only"/>
+ <p conref="../shared/impala_common.xml#common/added_in_220"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="isnottrue" rev="2.2.0">
+
+ <dt>
+ <codeph>isnottrue(<varname>boolean</varname>)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">isnottrue() function</indexterm>
+ <b>Purpose:</b> Tests if a Boolean expression is not <codeph>true</codeph> (that is, either <codeph>false</codeph> or <codeph>NULL</codeph>).
+ Returns <codeph>true</codeph> if so.
+ If the argument is <codeph>NULL</codeph>, returns <codeph>true</codeph>.
+ Identical to <codeph>isfalse()</codeph>, except it returns the opposite value for a <codeph>NULL</codeph> argument.
+ <p conref="../shared/impala_common.xml#common/return_type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/added_in_220"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="isnull">
+
+ <dt>
+ <codeph>isnull(type a, type ifNotNull)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">isnull() function</indexterm>
+ <b>Purpose:</b> Tests if an expression is <codeph>NULL</codeph>, and returns the expression result value
+ if not. If the first argument is <codeph>NULL</codeph>, returns the second argument.
+ <p>
+ <b>Compatibility notes:</b> Equivalent to the <codeph>nvl()</codeph> function from Oracle Database or
+ <codeph>ifnull()</codeph> from MySQL. The <codeph>nvl()</codeph> and <codeph>ifnull()</codeph>
+ functions are also available in Impala.
+ </p>
+ <p>
+ <b>Return type:</b> Same as the first argument value
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="istrue" rev="2.2.0">
+
+ <dt>
+ <codeph>istrue(<varname>boolean</varname>)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">istrue() function</indexterm>
+ <b>Purpose:</b> Tests if a Boolean expression is <codeph>true</codeph> or not.
+ Returns <codeph>true</codeph> if so.
+ If the argument is <codeph>NULL</codeph>, returns <codeph>false</codeph>.
+ Identical to <codeph>isnotfalse()</codeph>, except it returns the opposite value for a <codeph>NULL</codeph> argument.
+ <p conref="../shared/impala_common.xml#common/return_type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/for_compatibility_only"/>
+ <p conref="../shared/impala_common.xml#common/added_in_220"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="notnullvalue" rev="2.2.0">
+
+ <dt>
+ <codeph>notnullvalue(<varname>expression</varname>)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">notnullvalue() function</indexterm>
+ <b>Purpose:</b> Tests if an expression (of any type) is <codeph>NULL</codeph> or not.
+ Returns <codeph>false</codeph> if so.
+ The converse of <codeph>nullvalue()</codeph>.
+ <p conref="../shared/impala_common.xml#common/return_type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/for_compatibility_only"/>
+ <p conref="../shared/impala_common.xml#common/added_in_220"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="nullif">
+
+ <dt>
+ <codeph>nullif(<varname>expr1</varname>,<varname>expr2</varname>)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">nullif() function</indexterm>
+ <b>Purpose:</b> Returns <codeph>NULL</codeph> if the two specified arguments are equal. If the specified
+ arguments are not equal, returns the value of <varname>expr1</varname>. The data types of the expressions
+ must be compatible, according to the conversion rules from <xref href="impala_datatypes.xml#datatypes"/>.
+ You cannot use an expression that evaluates to <codeph>NULL</codeph> for <varname>expr1</varname>; that
+ way, you can distinguish a return value of <codeph>NULL</codeph> from an argument value of
+ <codeph>NULL</codeph>, which would never match <varname>expr2</varname>.
+ <p>
+ <b>Usage notes:</b> This function is effectively shorthand for a <codeph>CASE</codeph> expression of
+ the form:
+ </p>
+<codeblock>CASE
+ WHEN <varname>expr1</varname> = <varname>expr2</varname> THEN NULL
+ ELSE <varname>expr1</varname>
+END</codeblock>
+ <p>
+ It is commonly used in division expressions, to produce a <codeph>NULL</codeph> result instead of a
+ divide-by-zero error when the divisor is equal to zero:
+ </p>
+<codeblock>select 1.0 / nullif(c1,0) as reciprocal from t1;</codeblock>
+ <p>
+ You might also use it for compatibility with other database systems that support the same
+ <codeph>NULLIF()</codeph> function.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_same_type"/>
+ <p conref="../shared/impala_common.xml#common/added_in_130"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="nullifzero">
+
+ <dt>
+ <codeph>nullifzero(<varname>numeric_expr</varname>)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">nullifzero() function</indexterm>
+ <b>Purpose:</b> Returns <codeph>NULL</codeph> if the numeric expression evaluates to 0, otherwise returns
+ the result of the expression.
+ <p>
+ <b>Usage notes:</b> Used to avoid error conditions such as divide-by-zero in numeric calculations.
+ Serves as shorthand for a more elaborate <codeph>CASE</codeph> expression, to simplify porting SQL with
+ vendor extensions to Impala.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_same_type"/>
+ <p conref="../shared/impala_common.xml#common/added_in_130"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="nullvalue" rev="2.2.0">
+
+ <dt>
+ <codeph>nullvalue(<varname>expression</varname>)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">nullvalue() function</indexterm>
+ <b>Purpose:</b> Tests if an expression (of any type) is <codeph>NULL</codeph> or not.
+ Returns <codeph>true</codeph> if so.
+ The converse of <codeph>notnullvalue()</codeph>.
+ <p conref="../shared/impala_common.xml#common/return_type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/for_compatibility_only"/>
+ <p conref="../shared/impala_common.xml#common/added_in_220"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="nvl" rev="1.1">
+
+ <dt>
+ <codeph>nvl(type a, type ifNotNull)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">nvl() function</indexterm>
+ <b>Purpose:</b> Alias for the <codeph>isnull()</codeph> function. Tests if an expression is
+ <codeph>NULL</codeph>, and returns the expression result value if not. If the first argument is
+ <codeph>NULL</codeph>, returns the second argument. Equivalent to the <codeph>nvl()</codeph> function
+ from Oracle Database or <codeph>ifnull()</codeph> from MySQL.
+ <p>
+ <b>Return type:</b> Same as the first argument value
+ </p>
+ <p conref="../shared/impala_common.xml#common/added_in_11"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="zeroifnull">
+
+ <dt>
+ <codeph>zeroifnull(<varname>numeric_expr</varname>)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">zeroifnull() function</indexterm>
+ <b>Purpose:</b> Returns 0 if the numeric expression evaluates to <codeph>NULL</codeph>, otherwise returns
+ the result of the expression.
+ <p>
+ <b>Usage notes:</b> Used to avoid unexpected results due to unexpected propagation of
+ <codeph>NULL</codeph> values in numeric calculations. Serves as shorthand for a more elaborate
+ <codeph>CASE</codeph> expression, to simplify porting SQL with vendor extensions to Impala.
+ </p>
+ <p conref="../shared/impala_common.xml#common/return_same_type"/>
+ <p conref="../shared/impala_common.xml#common/added_in_130"/>
+ </dd>
+
+ </dlentry>
+ </dl>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_conversion_functions.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_conversion_functions.xml b/docs/topics/impala_conversion_functions.xml
new file mode 100644
index 0000000..1050d0c
--- /dev/null
+++ b/docs/topics/impala_conversion_functions.xml
@@ -0,0 +1,758 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="conversion_functions">
+
+ <title>Impala Type Conversion Functions</title>
+ <titlealts><navtitle>Type Conversion Functions</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Conversion functions are usually used in combination with other functions, to explicitly pass the expected
+ data types. Impala has strict rules regarding data types for function parameters. For example, Impala does
+ not automatically convert a <codeph>DOUBLE</codeph> value to <codeph>FLOAT</codeph>, a
+ <codeph>BIGINT</codeph> value to <codeph>INT</codeph>, or other conversion where precision could be lost or
+ overflow could occur. Also, for reporting or dealing with loosely defined schemas in big data contexts,
+ you might frequently need to convert values to or from the <codeph>STRING</codeph> type.
+ </p>
+
+ <note>
+ Although in CDH 5.5.0, the <codeph>SHOW FUNCTIONS</codeph> output for
+ database <codeph>_IMPALA_BUILTINS</codeph> contains some function signatures
+ matching the pattern <codeph>castto*</codeph>, these functions are not intended
+ for public use and are expected to be hidden in future.
+ </note>
+
+ <p>
+ <b>Function reference:</b>
+ </p>
+
+ <p>
+ Impala supports the following type conversion functions:
+ </p>
+
+<dl>
+
+<dlentry id="cast">
+<dt>
+<codeph>cast(<varname>expr</varname> AS <varname>type</varname>)</codeph>
+</dt>
+
+<dd>
+<indexterm audience="Cloudera">cast() function</indexterm>
+<b>Purpose:</b> Converts the value of an expression to any other type.
+If the expression value is of a type that cannot be converted to the target type, the result is <codeph>NULL</codeph>.
+<p><b>Usage notes:</b>
+Use <codeph>CAST</codeph> when passing a column value or literal to a function that
+expects a parameter with a different type.
+Frequently used in SQL operations such as <codeph>CREATE TABLE AS SELECT</codeph>
+and <codeph>INSERT ... VALUES</codeph> to ensure that values from various sources
+are of the appropriate type for the destination columns.
+Where practical, do a one-time <codeph>CAST()</codeph> operation during the ingestion process
+to make each column into the appropriate type, rather than using many <codeph>CAST()</codeph>
+operations in each query; doing type conversions for each row during each query can be expensive
+for tables with millions or billions of rows.
+</p>
+ <p conref="../shared/impala_common.xml#common/timezone_conversion_caveat"/>
+
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>select concat('Here are the first ',10,' results.'); -- Fails
+select concat('Here are the first ',cast(10 as string),' results.'); -- Succeeds
+</codeblock>
+<p>
+The following example starts with a text table where every column has a type of <codeph>STRING</codeph>,
+which might be how you ingest data of unknown schema until you can verify the cleanliness of the underlying values.
+Then it uses <codeph>CAST()</codeph> to create a new Parquet table with the same data, but using specific
+numeric data types for the columns with numeric data. Using numeric types of appropriate sizes can result in
+substantial space savings on disk and in memory, and performance improvements in queries,
+over using strings or larger-than-necessary numeric types.
+</p>
+<codeblock>create table t1 (name string, x string, y string, z string);
+
+create table t2 stored as parquet
+as select
+ name,
+ cast(x as bigint) x,
+ cast(y as timestamp) y,
+ cast(z as smallint) z
+from t1;
+
+describe t2;
++------+-----------+---------+
+| name | type      | comment |
++------+-----------+---------+
+| name | string    |         |
+| x    | bigint    |         |
+| y    | timestamp |         |
+| z    | smallint  |         |
++------+-----------+---------+
+</codeblock>
+<p conref="../shared/impala_common.xml#common/related_info"/>
+<p>
+<!-- TK: Can you cast to or from MAP, ARRAY, STRUCT? -->
+ For details of casts from each kind of data type, see the description of
+ the appropriate type:
+ <xref href="impala_tinyint.xml#tinyint"/>,
+ <xref href="impala_smallint.xml#smallint"/>,
+ <xref href="impala_int.xml#int"/>,
+ <xref href="impala_bigint.xml#bigint"/>,
+ <xref href="impala_float.xml#float"/>,
+ <xref href="impala_double.xml#double"/>,
+ <xref href="impala_decimal.xml#decimal"/>,
+ <xref href="impala_string.xml#string"/>,
+ <xref href="impala_char.xml#char"/>,
+ <xref href="impala_varchar.xml#varchar"/>,
+ <xref href="impala_timestamp.xml#timestamp"/>,
+ <xref href="impala_boolean.xml#boolean"/>
+</p>
+</dd>
+</dlentry>
+
+<dlentry rev="2.3.0" id="casttobigint" audience="Cloudera">
+<dt>
+<codeph>casttobigint(type value)</codeph>
+</dt>
+<dd>
+<indexterm audience="Cloudera">casttobigint() function</indexterm>
+<b>Purpose:</b> Converts the value of an expression to <codeph>BIGINT</codeph>. If the expression value is of a type that cannot be converted to the target type, the result is <codeph>NULL</codeph>.
+<p><b>Return type:</b> <codeph>bigint</codeph></p>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_usage"/>
+<p conref="../shared/impala_common.xml#common/added_in_230"/>
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_example"/>
+<codeblock>create table small_types (x tinyint, y smallint, z int);
+
+create table big_types as
+ select casttobigint(x) as x, casttobigint(y) as y, casttobigint(z) as z
+ from small_types;
+
+describe big_types;
++------+--------+---------+
+| name | type | comment |
++------+--------+---------+
+| x | bigint | |
+| y | bigint | |
+| z | bigint | |
++------+--------+---------+
+</codeblock>
+</dd>
+</dlentry>
+
+<dlentry rev="2.3.0" id="casttoboolean" audience="Cloudera">
+<dt>
+<codeph>casttoboolean(type value)</codeph>
+</dt>
+<dd>
+<indexterm audience="Cloudera">casttoboolean() function</indexterm>
+<b>Purpose:</b> Converts the value of an expression to <codeph>BOOLEAN</codeph>.
+Numeric values of 0 evaluate to <codeph>false</codeph>, and non-zero values evaluate to <codeph>true</codeph>.
+If the expression value is of a type that cannot be converted to the target type, the result is <codeph>NULL</codeph>.
+In particular, <codeph>STRING</codeph> values (even <codeph>'1'</codeph>, <codeph>'0'</codeph>, <codeph>'true'</codeph>
+or <codeph>'false'</codeph>) always return <codeph>NULL</codeph> when converted to <codeph>BOOLEAN</codeph>.
+<p><b>Return type:</b> <codeph>boolean</codeph></p>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_usage"/>
+<p conref="../shared/impala_common.xml#common/added_in_230"/>
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_example"/>
+<codeblock>select casttoboolean(0);
++------------------+
+| casttoboolean(0) |
++------------------+
+| false |
++------------------+
+
+select casttoboolean(1);
++------------------+
+| casttoboolean(1) |
++------------------+
+| true |
++------------------+
+
+select casttoboolean(99);
++-------------------+
+| casttoboolean(99) |
++-------------------+
+| true |
++-------------------+
+
+select casttoboolean(0.0);
++--------------------+
+| casttoboolean(0.0) |
++--------------------+
+| false |
++--------------------+
+
+select casttoboolean(0.5);
++--------------------+
+| casttoboolean(0.5) |
++--------------------+
+| true |
++--------------------+
+
+select casttoboolean('');
++-------------------+
+| casttoboolean('') |
++-------------------+
+| NULL |
++-------------------+
+
+select casttoboolean('yes');
++----------------------+
+| casttoboolean('yes') |
++----------------------+
+| NULL |
++----------------------+
+
+select casttoboolean('0');
++--------------------+
+| casttoboolean('0') |
++--------------------+
+| NULL |
++--------------------+
+
+select casttoboolean('true');
++-----------------------+
+| casttoboolean('true') |
++-----------------------+
+| NULL |
++-----------------------+
+
+select casttoboolean('false');
++------------------------+
+| casttoboolean('false') |
++------------------------+
+| NULL |
++------------------------+
+</codeblock>
+</dd>
+</dlentry>
+
+<dlentry rev="2.3.0" id="casttochar" audience="Cloudera">
+<dt>
+<codeph>casttochar(type value)</codeph>
+</dt>
+<dd>
+<indexterm audience="Cloudera">casttochar() function</indexterm>
+<b>Purpose:</b> Converts the value of an expression to <codeph>CHAR</codeph>. If the expression value is of a type that cannot be converted to the target type, the result is <codeph>NULL</codeph>.
+<p><b>Return type:</b> <codeph>char</codeph></p>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_usage"/>
+<p conref="../shared/impala_common.xml#common/added_in_230"/>
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_example"/>
+<codeblock>create table char_types as select casttochar('hello world') as c1, casttochar('xyz') as c2, casttochar('x') as c3;
++-------------------+
+| summary |
++-------------------+
+| Inserted 1 row(s) |
++-------------------+
+
+describe char_types;
++------+--------+---------+
+| name | type | comment |
++------+--------+---------+
+| c1 | string | |
+| c2 | string | |
+| c3 | string | |
++------+--------+---------+
+</codeblock>
+</dd>
+</dlentry>
+
+<dlentry rev="2.3.0" id="casttodecimal" audience="Cloudera">
+<dt>
+<codeph>casttodecimal(type value)</codeph>
+</dt>
+<dd>
+<indexterm audience="Cloudera">casttodecimal() function</indexterm>
+<b>Purpose:</b> Converts the value of an expression to <codeph>DECIMAL</codeph>. If the expression value is of a type that cannot be converted to the target type, the result is <codeph>NULL</codeph>.
+<p><b>Return type:</b> <codeph>decimal</codeph></p>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_usage"/>
+<p conref="../shared/impala_common.xml#common/added_in_230"/>
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_example"/>
+<codeblock>select casttodecimal(5.4);
++--------------------+
+| casttodecimal(5.4) |
++--------------------+
+| 5.4 |
++--------------------+
+</codeblock>
+</dd>
+</dlentry>
+
+<dlentry rev="2.3.0" id="casttodouble" audience="Cloudera">
+<dt>
+<codeph>casttodouble(type value)</codeph>
+</dt>
+<dd>
+<indexterm audience="Cloudera">casttodouble() function</indexterm>
+<b>Purpose:</b> Converts the value of an expression to <codeph>DOUBLE</codeph>. If the expression value is of a type that cannot be converted to the target type, the result is <codeph>NULL</codeph>.
+<p><b>Return type:</b> <codeph>double</codeph></p>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_usage"/>
+<p conref="../shared/impala_common.xml#common/added_in_230"/>
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_example"/>
+<codeblock>select casttodouble(5);
++-----------------+
+| casttodouble(5) |
++-----------------+
+| 5 |
++-----------------+
+
+select casttodouble('3.141');
++-----------------------+
+| casttodouble('3.141') |
++-----------------------+
+| 3.141 |
++-----------------------+
+
+select casttodouble(1e6);
++--------------------+
+| casttodouble(1e+6) |
++--------------------+
+| 1000000 |
++--------------------+
+
+select casttodouble(true);
++--------------------+
+| casttodouble(true) |
++--------------------+
+| 1 |
++--------------------+
+
+select casttodouble(now());
++---------------------+
+| casttodouble(now()) |
++---------------------+
+| 1447622306.031178 |
++---------------------+
+</codeblock>
+</dd>
+</dlentry>
+
+<dlentry rev="2.3.0" id="casttofloat" audience="Cloudera">
+<dt>
+<codeph>casttofloat(type value)</codeph>
+</dt>
+<dd>
+<indexterm audience="Cloudera">casttofloat() function</indexterm>
+<b>Purpose:</b> Converts the value of an expression to <codeph>FLOAT</codeph>. If the expression value is of a type that cannot be converted to the target type, the result is <codeph>NULL</codeph>.
+<p><b>Return type:</b> <codeph>float</codeph></p>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_usage"/>
+<p conref="../shared/impala_common.xml#common/added_in_230"/>
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_example"/>
+<codeblock>select casttofloat(5);
++----------------+
+| casttofloat(5) |
++----------------+
+| 5 |
++----------------+
+
+select casttofloat('3.141');
++----------------------+
+| casttofloat('3.141') |
++----------------------+
+| 3.141000032424927 |
++----------------------+
+
+select casttofloat(1e6);
++-------------------+
+| casttofloat(1e+6) |
++-------------------+
+| 1000000 |
++-------------------+
+
+select casttofloat(true);
++-------------------+
+| casttofloat(true) |
++-------------------+
+| 1 |
++-------------------+
+
+select casttofloat(now());
++--------------------+
+| casttofloat(now()) |
++--------------------+
+| 1447622400 |
++--------------------+
+</codeblock>
+</dd>
+</dlentry>
+
+<dlentry rev="2.3.0" id="casttoint" audience="Cloudera">
+<dt>
+<codeph>casttoint(type value)</codeph>
+</dt>
+<dd>
+<indexterm audience="Cloudera">casttoint() function</indexterm>
+<b>Purpose:</b> Converts the value of an expression to <codeph>INT</codeph>. If the expression value is of a type that cannot be converted to the target type, the result is <codeph>NULL</codeph>.
+<p><b>Return type:</b> <codeph>int</codeph></p>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_usage"/>
+<p conref="../shared/impala_common.xml#common/added_in_230"/>
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_example"/>
+<codeblock>select casttoint(5.4);
++----------------+
+| casttoint(5.4) |
++----------------+
+| 5 |
++----------------+
+
+select casttoint(true);
++-----------------+
+| casttoint(true) |
++-----------------+
+| 1 |
++-----------------+
+
+select casttoint(now());
++------------------+
+| casttoint(now()) |
++------------------+
+| 1447622487 |
++------------------+
+
+select casttoint('3.141');
++--------------------+
+| casttoint('3.141') |
++--------------------+
+| NULL |
++--------------------+
+
+select casttoint('3');
++----------------+
+| casttoint('3') |
++----------------+
+| 3 |
++----------------+
+</codeblock>
+</dd>
+</dlentry>
+
+<dlentry rev="2.3.0" id="casttosmallint" audience="Cloudera">
+<dt>
+<codeph>casttosmallint(type value)</codeph>
+</dt>
+<dd>
+<indexterm audience="Cloudera">casttosmallint() function</indexterm>
+<b>Purpose:</b> Converts the value of an expression to <codeph>SMALLINT</codeph>. If the expression value is of a type that cannot be converted to the target type, the result is <codeph>NULL</codeph>.
+<p><b>Return type:</b> <codeph>smallint</codeph></p>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_usage"/>
+<p conref="../shared/impala_common.xml#common/added_in_230"/>
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_example"/>
+<codeblock>create table big_types (x bigint, y int, z smallint);
+
+create table small_types as
+ select casttosmallint(x) as x, casttosmallint(y) as y, casttosmallint(z) as z
+ from big_types;
+
+describe small_types;
++------+----------+---------+
+| name | type | comment |
++------+----------+---------+
+| x | smallint | |
+| y | smallint | |
+| z | smallint | |
++------+----------+---------+
+</codeblock>
+</dd>
+</dlentry>
+
+<dlentry rev="2.3.0" id="casttostring" audience="Cloudera">
+<dt>
+<codeph>casttostring(type value)</codeph>
+</dt>
+<dd>
+<indexterm audience="Cloudera">casttostring() function</indexterm>
+<b>Purpose:</b> Converts the value of an expression to <codeph>STRING</codeph>. If the expression value is of a type that cannot be converted to the target type, the result is <codeph>NULL</codeph>.
+<p><b>Return type:</b> <codeph>string</codeph></p>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_usage"/>
+<p conref="../shared/impala_common.xml#common/added_in_230"/>
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_example"/>
+<codeblock>create table numeric_types (x int, y bigint, z tinyint);
+
+create table string_types as
+ select casttostring(x) as x, casttostring(y) as y, casttostring(z) as z
+ from numeric_types;
+
+describe string_types;
++------+--------+---------+
+| name | type | comment |
++------+--------+---------+
+| x | string | |
+| y | string | |
+| z | string | |
++------+--------+---------+
+</codeblock>
+</dd>
+</dlentry>
+
+<dlentry rev="2.3.0" id="casttotimestamp" audience="Cloudera">
+<dt>
+<codeph>casttotimestamp(type value)</codeph>
+</dt>
+<dd>
+<indexterm audience="Cloudera">casttotimestamp() function</indexterm>
+<b>Purpose:</b> Converts the value of an expression to <codeph>TIMESTAMP</codeph>. If the expression value is of a type that cannot be converted to the target type, the result is <codeph>NULL</codeph>.
+<p><b>Return type:</b> <codeph>timestamp</codeph></p>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_usage"/>
+<p conref="../shared/impala_common.xml#common/added_in_230"/>
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_example"/>
+<codeblock>select casttotimestamp(1000);
++-----------------------+
+| casttotimestamp(1000) |
++-----------------------+
+| 1970-01-01 00:16:40 |
++-----------------------+
+
+select casttotimestamp(1000.0);
++-------------------------+
+| casttotimestamp(1000.0) |
++-------------------------+
+| 1970-01-01 00:16:40 |
++-------------------------+
+
+select casttotimestamp('1000');
++-------------------------+
+| casttotimestamp('1000') |
++-------------------------+
+| NULL |
++-------------------------+
+</codeblock>
+</dd>
+</dlentry>
+
+<dlentry rev="2.3.0" id="casttotinyint" audience="Cloudera">
+<dt>
+<codeph>casttotinyint(type value)</codeph>
+</dt>
+<dd>
+<indexterm audience="Cloudera">casttotinyint() function</indexterm>
+<b>Purpose:</b> Converts the value of an expression to <codeph>TINYINT</codeph>. If the expression value is of a type that cannot be converted to the target type, the result is <codeph>NULL</codeph>.
+<p><b>Return type:</b> <codeph>tinyint</codeph></p>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_usage"/>
+<p conref="../shared/impala_common.xml#common/added_in_230"/>
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_example"/>
+<codeblock>create table big_types (x bigint, y int, z smallint);
+
+create table tiny_types as
+ select casttotinyint(x) as x, casttotinyint(y) as y, casttotinyint(z) as z
+ from big_types;
+
+describe tiny_types;
++------+---------+---------+
+| name | type | comment |
++------+---------+---------+
+| x | tinyint | |
+| y | tinyint | |
+| z | tinyint | |
++------+---------+---------+
+</codeblock>
+</dd>
+</dlentry>
+
+<dlentry rev="2.3.0" id="casttovarchar" audience="Cloudera">
+<dt>
+<codeph>casttovarchar(type value)</codeph>
+</dt>
+<dd>
+<indexterm audience="Cloudera">casttovarchar() function</indexterm>
+<b>Purpose:</b> Converts the value of an expression to <codeph>VARCHAR</codeph>. If the expression value is of a type that cannot be converted to the target type, the result is <codeph>NULL</codeph>.
+<p><b>Return type:</b> <codeph>varchar</codeph></p>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_usage"/>
+<p conref="../shared/impala_common.xml#common/added_in_230"/>
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<p conref="../shared/impala_common.xml#common/cast_convenience_fn_example"/>
+<codeblock>select casttovarchar('abcd');
++-----------------------+
+| casttovarchar('abcd') |
++-----------------------+
+| abcd |
++-----------------------+
+
+select casttovarchar(999);
++--------------------+
+| casttovarchar(999) |
++--------------------+
+| 999 |
++--------------------+
+
+select casttovarchar(999.5);
++----------------------+
+| casttovarchar(999.5) |
++----------------------+
+| 999.5 |
++----------------------+
+
+select casttovarchar(now());
++-------------------------------+
+| casttovarchar(now()) |
++-------------------------------+
+| 2015-11-15 21:26:13.528073000 |
++-------------------------------+
+
+select casttovarchar(true);
++---------------------+
+| casttovarchar(true) |
++---------------------+
+| 1 |
++---------------------+
+</codeblock>
+</dd>
+</dlentry>
+
+<dlentry rev="2.3.0" id="typeof">
+<dt>
+<codeph>typeof(type value)</codeph>
+</dt>
+<dd>
+<indexterm audience="Cloudera">typeof() function</indexterm>
+<b>Purpose:</b> Returns the name of the data type corresponding to an expression. For types with
+extra attributes, such as length for <codeph>CHAR</codeph> and <codeph>VARCHAR</codeph>,
+or precision and scale for <codeph>DECIMAL</codeph>, includes the full specification of the type.
+<!-- To do: How about for columns of complex types? Or fields within complex types? -->
+<p><b>Return type:</b> <codeph>string</codeph></p>
+<p><b>Usage notes:</b> Typically used in interactive exploration of a schema, or in application code that programmatically generates schema definitions such as <codeph>CREATE TABLE</codeph> statements.
+For example, previously, to understand the type of an expression such as
+<codeph>col1 / col2</codeph> or <codeph>concat(col1, col2, col3)</codeph>,
+you might have created a dummy table with a single row, using syntax such as <codeph>CREATE TABLE foo AS SELECT 5 / 3.0</codeph>,
+and then done a <codeph>DESCRIBE</codeph> to see the type of the resulting column.
+Or you might have done a <codeph>CREATE TABLE AS SELECT</codeph> operation to create a table and
+copy data into it, only learning the types of the columns by doing a <codeph>DESCRIBE</codeph> afterward.
+This technique is especially useful for arithmetic expressions involving <codeph>DECIMAL</codeph> types,
+because the precision and scale of the result is typically different than that of the operands.
+</p>
+<p conref="../shared/impala_common.xml#common/added_in_230"/>
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<p>
+These examples show how to check the type of a simple literal or function value.
+Notice how adding even tiny integers together changes the data type of the result to
+avoid overflow, and how the results of arithmetic operations on <codeph>DECIMAL</codeph> values
+have specific precision and scale attributes.
+</p>
+<codeblock>select typeof(2)
++-----------+
+| typeof(2) |
++-----------+
+| TINYINT |
++-----------+
+
+select typeof(2+2)
++---------------+
+| typeof(2 + 2) |
++---------------+
+| SMALLINT |
++---------------+
+
+select typeof('xyz')
++---------------+
+| typeof('xyz') |
++---------------+
+| STRING |
++---------------+
+
+select typeof(now())
++---------------+
+| typeof(now()) |
++---------------+
+| TIMESTAMP |
++---------------+
+
+select typeof(5.3 / 2.1)
++-------------------+
+| typeof(5.3 / 2.1) |
++-------------------+
+| DECIMAL(6,4) |
++-------------------+
+
+select typeof(5.30001 / 2342.1);
++--------------------------+
+| typeof(5.30001 / 2342.1) |
++--------------------------+
+| DECIMAL(13,11) |
++--------------------------+
+
+select typeof(typeof(2+2))
++-----------------------+
+| typeof(typeof(2 + 2)) |
++-----------------------+
+| STRING |
++-----------------------+
+</codeblock>
+
+<p>
+This example shows how even if you do not have a record of the type of a column,
+for example because the type was changed by <codeph>ALTER TABLE</codeph> after the
+original <codeph>CREATE TABLE</codeph>, you can still find out the type in a
+more compact form than examining the full <codeph>DESCRIBE</codeph> output.
+Remember to use <codeph>LIMIT 1</codeph> in such cases, to avoid an identical
+result value for every row in the table.
+</p>
+<codeblock>create table typeof_example (a int, b tinyint, c smallint, d bigint);
+
+/* Empty result set if there is no data in the table. */
+select typeof(a) from typeof_example;
+
+/* OK, now we have some data but the type of column A is being changed. */
+insert into typeof_example values (1, 2, 3, 4);
+alter table typeof_example change a a bigint;
+
+/* We can always find out the current type of that column without doing a full DESCRIBE. */
+select typeof(a) from typeof_example limit 1;
++-----------+
+| typeof(a) |
++-----------+
+| BIGINT |
++-----------+
+</codeblock>
+<p>
+This example shows how you might programmatically generate a <codeph>CREATE TABLE</codeph> statement
+with the appropriate column definitions to hold the result values of arbitrary expressions.
+The <codeph>typeof()</codeph> function lets you construct a detailed <codeph>CREATE TABLE</codeph> statement
+without actually creating the table, as opposed to <codeph>CREATE TABLE AS SELECT</codeph> operations
+where you create the destination table but only learn the column data types afterward through <codeph>DESCRIBE</codeph>.
+</p>
+<codeblock>describe typeof_example;
++------+----------+---------+
+| name | type | comment |
++------+----------+---------+
+| a | bigint | |
+| b | tinyint | |
+| c | smallint | |
+| d | bigint | |
++------+----------+---------+
+
+/* An ETL or business intelligence tool might create variations on a table with different file formats,
+ different sets of columns, and so on. TYPEOF() lets an application introspect the types of the original columns. */
+select concat('create table derived_table (a ', typeof(a), ', b ', typeof(b), ', c ',
+ typeof(c), ', d ', typeof(d), ') stored as parquet;')
+ as 'create table statement'
+from typeof_example limit 1;
++-------------------------------------------------------------------------------------------+
+| create table statement |
++-------------------------------------------------------------------------------------------+
+| create table derived_table (a BIGINT, b TINYINT, c SMALLINT, d BIGINT) stored as parquet; |
++-------------------------------------------------------------------------------------------+
+</codeblock>
+</dd>
+</dlentry>
+
+</dl>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_count.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_count.xml b/docs/topics/impala_count.xml
new file mode 100644
index 0000000..2f3f519
--- /dev/null
+++ b/docs/topics/impala_count.xml
@@ -0,0 +1,230 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="count">
+
+ <title>COUNT Function</title>
+ <titlealts><navtitle>COUNT</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="Analytic Functions"/>
+ <data name="Category" value="Aggregate Functions"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">count() function</indexterm>
+ An aggregate function that returns the number of rows, or the number of non-<codeph>NULL</codeph> rows.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>COUNT([DISTINCT | ALL] <varname>expression</varname>) [OVER (<varname>analytic_clause</varname>)]</codeblock>
+
+ <p>
+ Depending on the argument, <codeph>COUNT()</codeph> considers rows that meet certain conditions:
+ </p>
+
+ <ul>
+ <li>
+ The notation <codeph>COUNT(*)</codeph> includes <codeph>NULL</codeph> values in the total.
+ </li>
+
+ <li>
+ The notation <codeph>COUNT(<varname>column_name</varname>)</codeph> only considers rows where the column
+ contains a non-<codeph>NULL</codeph> value.
+ </li>
+
+ <li>
+ You can also combine <codeph>COUNT</codeph> with the <codeph>DISTINCT</codeph> operator to eliminate
+ duplicates before counting, and to count the combinations of values across multiple columns.
+ </li>
+ </ul>
+
+ <p>
+ When the query contains a <codeph>GROUP BY</codeph> clause, returns one value for each combination of
+ grouping values.
+ </p>
+
+ <p>
+ <b>Return type:</b> <codeph>BIGINT</codeph>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_explanation"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_example"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>-- How many rows total are in the table, regardless of NULL values?
+select count(*) from t1;
+-- How many rows are in the table with non-NULL values for a column?
+select count(c1) from t1;
+-- Count the rows that meet certain conditions.
+-- Again, * includes NULLs, so COUNT(*) might be greater than COUNT(col).
+select count(*) from t1 where x > 10;
+select count(c1) from t1 where x > 10;
+-- Can also be used in combination with DISTINCT and/or GROUP BY.
+-- Combine COUNT and DISTINCT to find the number of unique values.
+-- Must use column names rather than * with COUNT(DISTINCT ...) syntax.
+-- Rows with NULL values are not counted.
+select count(distinct c1) from t1;
+-- Rows with a NULL value in _either_ column are not counted.
+select count(distinct c1, c2) from t1;
+-- Return more than one result.
+select month, year, count(distinct visitor_id) from web_stats group by month, year;
+</codeblock>
+
+ <p rev="2.0.0">
+ The following examples show how to use <codeph>COUNT()</codeph> in an analytic context. They use a table
+ containing integers from 1 to 10. Notice how the <codeph>COUNT()</codeph> is reported for each input value, as
+ opposed to the <codeph>GROUP BY</codeph> clause which condenses the result set.
+<codeblock>select x, property, count(x) over (partition by property) as count from int_t where property in ('odd','even');
++----+----------+-------+
+| x | property | count |
++----+----------+-------+
+| 2 | even | 5 |
+| 4 | even | 5 |
+| 6 | even | 5 |
+| 8 | even | 5 |
+| 10 | even | 5 |
+| 1 | odd | 5 |
+| 3 | odd | 5 |
+| 5 | odd | 5 |
+| 7 | odd | 5 |
+| 9 | odd | 5 |
++----+----------+-------+
+</codeblock>
+
+Adding an <codeph>ORDER BY</codeph> clause lets you experiment with results that are cumulative or apply to a moving
+set of rows (the <q>window</q>). The following examples use <codeph>COUNT()</codeph> in an analytic context
+(that is, with an <codeph>OVER()</codeph> clause) to produce a running count of all the even values,
+then a running count of all the odd values. The basic <codeph>ORDER BY x</codeph> clause implicitly
+activates a window clause of <codeph>RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>,
+which is effectively the same as <codeph>ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>,
+therefore all of these examples produce the same results:
+<codeblock>select x, property,
+ count(x) over (partition by property <b>order by x</b>) as 'cumulative count'
+ from int_t where property in ('odd','even');
++----+----------+------------------+
+| x | property | cumulative count |
++----+----------+------------------+
+| 2 | even | 1 |
+| 4 | even | 2 |
+| 6 | even | 3 |
+| 8 | even | 4 |
+| 10 | even | 5 |
+| 1 | odd | 1 |
+| 3 | odd | 2 |
+| 5 | odd | 3 |
+| 7 | odd | 4 |
+| 9 | odd | 5 |
++----+----------+------------------+
+
+select x, property,
+ count(x) over
+ (
+ partition by property
+ <b>order by x</b>
+ <b>range between unbounded preceding and current row</b>
+ ) as 'cumulative count'
+from int_t where property in ('odd','even');
++----+----------+------------------+
+| x | property | cumulative count |
++----+----------+------------------+
+| 2 | even | 1 |
+| 4 | even | 2 |
+| 6 | even | 3 |
+| 8 | even | 4 |
+| 10 | even | 5 |
+| 1 | odd | 1 |
+| 3 | odd | 2 |
+| 5 | odd | 3 |
+| 7 | odd | 4 |
+| 9 | odd | 5 |
++----+----------+------------------+
+
+select x, property,
+ count(x) over
+ (
+ partition by property
+ <b>order by x</b>
+ <b>rows between unbounded preceding and current row</b>
+ ) as 'cumulative count'
+ from int_t where property in ('odd','even');
++----+----------+------------------+
+| x | property | cumulative count |
++----+----------+------------------+
+| 2 | even | 1 |
+| 4 | even | 2 |
+| 6 | even | 3 |
+| 8 | even | 4 |
+| 10 | even | 5 |
+| 1 | odd | 1 |
+| 3 | odd | 2 |
+| 5 | odd | 3 |
+| 7 | odd | 4 |
+| 9 | odd | 5 |
++----+----------+------------------+
+</codeblock>
+
+The following examples show how to construct a moving window, with a running count taking into account 1 row before
+and 1 row after the current row, within the same partition (all the even values or all the odd values).
+Therefore, the count is consistently 3 for rows in the middle of the window, and 2 for
+rows near the ends of the window, where there is no preceding or no following row in the partition.
+Because of a restriction in the Impala <codeph>RANGE</codeph> syntax, this type of
+moving window is possible with the <codeph>ROWS BETWEEN</codeph> clause but not the <codeph>RANGE BETWEEN</codeph>
+clause:
+<codeblock>select x, property,
+ count(x) over
+ (
+ partition by property
+ <b>order by x</b>
+ <b>rows between 1 preceding and 1 following</b>
+ ) as 'moving total'
+ from int_t where property in ('odd','even');
++----+----------+--------------+
+| x | property | moving total |
++----+----------+--------------+
+| 2 | even | 2 |
+| 4 | even | 3 |
+| 6 | even | 3 |
+| 8 | even | 3 |
+| 10 | even | 2 |
+| 1 | odd | 2 |
+| 3 | odd | 3 |
+| 5 | odd | 3 |
+| 7 | odd | 3 |
+| 9 | odd | 2 |
++----+----------+--------------+
+
+-- Doesn't work because of syntax restriction on RANGE clause.
+select x, property,
+ count(x) over
+ (
+ partition by property
+ <b>order by x</b>
+ <b>range between 1 preceding and 1 following</b>
+ ) as 'moving total'
+from int_t where property in ('odd','even');
+ERROR: AnalysisException: RANGE is only supported with both the lower and upper bounds UNBOUNDED or one UNBOUNDED and the other CURRENT ROW.
+</codeblock>
+ </p>
+
+ <note conref="../shared/impala_common.xml#common/multiple_count_distinct"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_analytic_functions.xml#analytic_functions"/>
+ </p>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_create_database.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_create_database.xml b/docs/topics/impala_create_database.xml
new file mode 100644
index 0000000..f4153e0
--- /dev/null
+++ b/docs/topics/impala_create_database.xml
@@ -0,0 +1,115 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="create_database">
+
+ <title>CREATE DATABASE Statement</title>
+ <titlealts><navtitle>CREATE DATABASE</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Databases"/>
+ <data name="Category" value="Schemas"/>
+ <data name="Category" value="DDL"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">CREATE DATABASE statement</indexterm>
+ Creates a new database.
+ </p>
+
+ <p>
+ In Impala, a database is both:
+ </p>
+
+ <ul>
+ <li>
+ A logical construct for grouping together related tables, views, and functions within their own namespace.
+ You might use a separate database for each application, set of related tables, or round of experimentation.
+ </li>
+
+ <li>
+ A physical construct represented by a directory tree in HDFS. Tables (internal tables), partitions, and
+ data files are all located under this directory. You can perform HDFS-level operations such as backing it up and measuring space usage,
+ or remove it with a <codeph>DROP DATABASE</codeph> statement.
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] <varname>database_name</varname> [COMMENT '<varname>database_comment</varname>']
+ [LOCATION <varname>hdfs_path</varname>];</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ A database is physically represented as a directory in HDFS, with a filename extension <codeph>.db</codeph>,
+ under the main Impala data directory. If the associated HDFS directory does not exist, it is created for you.
+ All databases and their associated directories are top-level objects, with no physical or logical nesting.
+ </p>
+
+ <p>
+ After creating a database, to make it the current database within an <cmdname>impala-shell</cmdname> session,
+ use the <codeph>USE</codeph> statement. You can refer to tables in the current database without prepending
+ any qualifier to their names.
+ </p>
+
+ <p>
+ When you first connect to Impala through <cmdname>impala-shell</cmdname>, the database you start in (before
+ issuing any <codeph>CREATE DATABASE</codeph> or <codeph>USE</codeph> statements) is named
+ <codeph>default</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/builtins_db"/>
+
+ <p>
+ After creating a database, your <cmdname>impala-shell</cmdname> session or another
+ <cmdname>impala-shell</cmdname> connected to the same node can immediately access that database. To access
+ the database through the Impala daemon on a different node, issue the <codeph>INVALIDATE METADATA</codeph>
+ statement first while connected to that other node.
+ </p>
+
+ <p>
+ Setting the <codeph>LOCATION</codeph> attribute for a new database is a way to work with sets of files in an
+ HDFS directory structure outside the default Impala data directory, as opposed to setting the
+ <codeph>LOCATION</codeph> attribute for each individual table.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/hive_blurb"/>
+
+ <p>
+ When you create a database in Impala, the database can also be used by Hive.
+ When you create a database in Hive, issue an <codeph>INVALIDATE METADATA</codeph>
+ statement in Impala to make Impala permanently aware of the new database.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have write
+ permission for the parent HDFS directory under which the database
+ is located.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <codeblock conref="../shared/impala_common.xml#common/create_drop_db_example"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_databases.xml#databases"/>, <xref href="impala_drop_database.xml#drop_database"/>,
+ <xref href="impala_use.xml#use"/>, <xref href="impala_show.xml#show_databases"/>,
+ <xref href="impala_tables.xml#tables"/>
+ </p>
+ </conbody>
+</concept>
[09/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_live_summary.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_live_summary.xml b/docs/topics/impala_live_summary.xml
new file mode 100644
index 0000000..bfe71bf
--- /dev/null
+++ b/docs/topics/impala_live_summary.xml
@@ -0,0 +1,207 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.3.0" id="live_summary">
+
+ <title>LIVE_SUMMARY Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Performance"/>
+ <data name="Category" value="Reports"/>
+ <data name="Category" value="impala-shell"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">LIVE_SUMMARY query option</indexterm>
+ For queries submitted through the <cmdname>impala-shell</cmdname> command,
+ displays the same output as the <codeph>SUMMARY</codeph> command,
+ with the measurements updated in real time as the query progresses.
+ When the query finishes, the final <codeph>SUMMARY</codeph> output remains
+ visible in the <cmdname>impala-shell</cmdname> console output.
+ </p>
+
+ <p>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/default_false_0"/>
+
+ <p conref="../shared/impala_common.xml#common/command_line_blurb"/>
+ <p>
+ You can enable this query option within <cmdname>impala-shell</cmdname>
+ by starting the shell with the <codeph>--live_summary</codeph>
+ command-line option.
+ You can still turn this setting off and on again within the shell through the
+ <codeph>SET</codeph> command.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ The live summary output can be useful for evaluating long-running queries,
+ to evaluate which phase of execution takes up the most time, or if some hosts
+ take much longer than others for certain operations, dragging overall performance down.
+ By making the information available in real time, this feature lets you decide what
+ action to take even before you cancel a query that is taking much longer than normal.
+ </p>
+ <p>
+ For example, you might see the HDFS scan phase taking a long time, and therefore revisit
+ performance-related aspects of your schema design such as constructing a partitioned table,
+ switching to the Parquet file format, running the <codeph>COMPUTE STATS</codeph> statement
+ for the table, and so on.
+ Or you might see a wide variation between the average and maximum times for all hosts to
+ perform some phase of the query, and therefore investigate if one particular host
+ needed more memory or was experiencing a network problem.
+ </p>
+ <p conref="../shared/impala_common.xml#common/live_reporting_details"/>
+ <p>
+ For a simple and concise way of tracking the progress of an interactive query, see
+ <xref href="impala_live_progress.xml#live_progress"/>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+ <p conref="../shared/impala_common.xml#common/impala_shell_progress_reports_compute_stats_caveat"/>
+ <p conref="../shared/impala_common.xml#common/impala_shell_progress_reports_shell_only_caveat"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example shows a series of <codeph>LIVE_SUMMARY</codeph> reports that
+ are displayed during the course of a query, showing how the numbers increase to
+ show the progress of different phases of the distributed query. When you do the same
+ in <cmdname>impala-shell</cmdname>, only a single report is displayed at any one time,
+ with each update overwriting the previous numbers.
+ </p>
+
+<codeblock><![CDATA[[localhost:21000] > set live_summary=true;
+LIVE_SUMMARY set to true
+[localhost:21000] > select count(*) from customer t1 cross join customer t2;
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| 06:AGGREGATE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | FINALIZE |
+| 05:EXCHANGE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | UNPARTITIONED |
+| 03:AGGREGATE | 0 | 0ns | 0ns | 0 | 1 | 0 B | 10.00 MB | |
+| 02:NESTED LOOP JOIN | 0 | 0ns | 0ns | 0 | 22.50B | 0 B | 0 B | CROSS JOIN, BROADCAST |
+| |--04:EXCHANGE | 0 | 0ns | 0ns | 0 | 150.00K | 0 B | 0 B | BROADCAST |
+| | 01:SCAN HDFS | 1 | 503.57ms | 503.57ms | 150.00K | 150.00K | 24.09 MB | 64.00 MB | tpch.customer t2 |
+| 00:SCAN HDFS | 0 | 0ns | 0ns | 0 | 150.00K | 0 B | 64.00 MB | tpch.customer t1 |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| 06:AGGREGATE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | FINALIZE |
+| 05:EXCHANGE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | UNPARTITIONED |
+| 03:AGGREGATE | 1 | 0ns | 0ns | 0 | 1 | 20.00 KB | 10.00 MB | |
+| 02:NESTED LOOP JOIN | 1 | 17.62s | 17.62s | 81.14M | 22.50B | 3.23 MB | 0 B | CROSS JOIN, BROADCAST |
+| |--04:EXCHANGE | 1 | 26.29ms | 26.29ms | 150.00K | 150.00K | 0 B | 0 B | BROADCAST |
+| | 01:SCAN HDFS | 1 | 503.57ms | 503.57ms | 150.00K | 150.00K | 24.09 MB | 64.00 MB | tpch.customer t2 |
+| 00:SCAN HDFS | 1 | 247.53ms | 247.53ms | 1.02K | 150.00K | 24.39 MB | 64.00 MB | tpch.customer t1 |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| 06:AGGREGATE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | FINALIZE |
+| 05:EXCHANGE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | UNPARTITIONED |
+| 03:AGGREGATE | 1 | 0ns | 0ns | 0 | 1 | 20.00 KB | 10.00 MB | |
+| 02:NESTED LOOP JOIN | 1 | 61.85s | 61.85s | 283.43M | 22.50B | 3.23 MB | 0 B | CROSS JOIN, BROADCAST |
+| |--04:EXCHANGE | 1 | 26.29ms | 26.29ms | 150.00K | 150.00K | 0 B | 0 B | BROADCAST |
+| | 01:SCAN HDFS | 1 | 503.57ms | 503.57ms | 150.00K | 150.00K | 24.09 MB | 64.00 MB | tpch.customer t2 |
+| 00:SCAN HDFS | 1 | 247.59ms | 247.59ms | 2.05K | 150.00K | 24.39 MB | 64.00 MB | tpch.customer t1 |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+]]>
+</codeblock>
+
+<!-- Keeping this sample output that illustrates a couple of glitches in the LIVE_SUMMARY display, hidden, to help filing JIRAs. -->
+<codeblock audience="Cloudera"><![CDATA[[
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| 06:AGGREGATE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | FINALIZE |
+| 05:EXCHANGE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | UNPARTITIONED |
+| 03:AGGREGATE | 1 | 0ns | 0ns | 0 | 1 | 20.00 KB | 10.00 MB | |
+| 02:NESTED LOOP JOIN | 1 | 91.34s | 91.34s | 419.48M | 22.50B | 3.23 MB | 0 B | CROSS JOIN, BROADCAST |
+| |--04:EXCHANGE | 1 | 26.29ms | 26.29ms | 150.00K | 150.00K | 0 B | 0 B | BROADCAST |
+| | 01:SCAN HDFS | 1 | 503.57ms | 503.57ms | 150.00K | 150.00K | 24.09 MB | 64.00 MB | tpch.customer t2 |
+| 00:SCAN HDFS | 1 | 247.63ms | 247.63ms | 3.07K | 150.00K | 24.39 MB | 64.00 MB | tpch.customer t1 |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| 06:AGGREGATE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | FINALIZE |
+| 05:EXCHANGE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | UNPARTITIONED |
+| 03:AGGREGATE | 1 | 0ns | 0ns | 0 | 1 | 20.00 KB | 10.00 MB | |
+| 02:NESTED LOOP JOIN | 1 | 140.49s | 140.49s | 646.82M | 22.50B | 3.23 MB | 0 B | CROSS JOIN, BROADCAST |
+| |--04:EXCHANGE | 1 | 26.29ms | 26.29ms | 150.00K | 150.00K | 0 B | 0 B | BROADCAST |
+| | 01:SCAN HDFS | 1 | 503.57ms | 503.57ms | 150.00K | 150.00K | 24.09 MB | 64.00 MB | tpch.customer t2 |
+| 00:SCAN HDFS | 1 | 247.73ms | 247.73ms | 5.12K | 150.00K | 24.39 MB | 64.00 MB | tpch.customer t1 |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| 06:AGGREGATE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | FINALIZE |
+| 05:EXCHANGE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | UNPARTITIONED |
+| 03:AGGREGATE | 1 | 0ns | 0ns | 0 | 1 | 20.00 KB | 10.00 MB | |
+| 02:NESTED LOOP JOIN | 1 | 228.96s | 228.96s | 1.06B | 22.50B | 3.23 MB | 0 B | CROSS JOIN, BROADCAST |
+| |--04:EXCHANGE | 1 | 26.29ms | 26.29ms | 150.00K | 150.00K | 0 B | 0 B | BROADCAST |
+| | 01:SCAN HDFS | 1 | 503.57ms | 503.57ms | 150.00K | 150.00K | 24.09 MB | 64.00 MB | tpch.customer t2 |
+| 00:SCAN HDFS | 1 | 247.83ms | 247.83ms | 7.17K | 150.00K | 24.39 MB | 64.00 MB | tpch.customer t1 |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| 06:AGGREGATE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | FINALIZE |
+| 05:EXCHANGE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | UNPARTITIONED |
+| 03:AGGREGATE | 1 | 0ns | 0ns | 0 | 1 | 20.00 KB | 10.00 MB | |
+| 02:NESTED LOOP JOIN | 1 | 563.11s | 563.11s | 2.59B | 22.50B | 3.23 MB | 0 B | CROSS JOIN, BROADCAST |
+| |--04:EXCHANGE | 1 | 26.29ms | 26.29ms | 150.00K | 150.00K | 0 B | 0 B | BROADCAST |
+| | 01:SCAN HDFS | 1 | 503.57ms | 503.57ms | 150.00K | 150.00K | 24.09 MB | 64.00 MB | tpch.customer t2 |
+| 00:SCAN HDFS | 1 | 248.11ms | 248.11ms | 17.41K | 150.00K | 24.39 MB | 64.00 MB | tpch.customer t1 |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| 06:AGGREGATE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | FINALIZE |
+| 05:EXCHANGE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | UNPARTITIONED |
+| 03:AGGREGATE | 1 | 0ns | 0ns | 0 | 1 | 20.00 KB | 10.00 MB | |
+| 02:NESTED LOOP JOIN | 1 | 985.71s | 985.71s | 4.54B | 22.50B | 3.23 MB | 0 B | CROSS JOIN, BROADCAST |
+| |--04:EXCHANGE | 1 | 26.29ms | 26.29ms | 150.00K | 150.00K | 0 B | 0 B | BROADCAST |
+| | 01:SCAN HDFS | 1 | 503.57ms | 503.57ms | 150.00K | 150.00K | 24.09 MB | 64.00 MB | tpch.customer t2 |
+| 00:SCAN HDFS | 1 | 248.49ms | 248.49ms | 30.72K | 150.00K | 24.39 MB | 64.00 MB | tpch.customer t1 |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| 06:AGGREGATE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | FINALIZE |
+| 05:EXCHANGE | 0 | 0ns | 0ns | 0 | 1 | 0 B | -1 B | UNPARTITIONED |
+| 03:AGGREGATE | 1 | 0ns | 0ns | 0 | 1 | 20.00 KB | 10.00 MB | |
+| 02:NESTED LOOP JOIN | 1 | None | None | 5.42B | 22.50B | 3.23 MB | 0 B | CROSS JOIN, BROADCAST |
+| |--04:EXCHANGE | 1 | 26.29ms | 26.29ms | 150.00K | 150.00K | 0 B | 0 B | BROADCAST |
+| | 01:SCAN HDFS | 1 | 503.57ms | 503.57ms | 150.00K | 150.00K | 24.09 MB | 64.00 MB | tpch.customer t2 |
+| 00:SCAN HDFS | 1 | 248.66ms | 248.66ms | 36.86K | 150.00K | 24.39 MB | 64.00 MB | tpch.customer t1 |
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+
+[localhost:21000] > select count(*) from customer t1 cross join customer t2;
+Query: select count(*) from customer t1 cross join customer t2
+[####################################################################################################] 100%
++---------------------+--------+----------+----------+---------+------------+----------+---------------+-----------------------+
+| Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail |
+[localhost:21000] >
+]]>
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/live_progress_live_summary_asciinema"/>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_load_data.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_load_data.xml b/docs/topics/impala_load_data.xml
new file mode 100644
index 0000000..e3517f0
--- /dev/null
+++ b/docs/topics/impala_load_data.xml
@@ -0,0 +1,237 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.1" id="load_data">
+
+ <title>LOAD DATA Statement</title>
+ <titlealts><navtitle>LOAD DATA</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="ETL"/>
+ <data name="Category" value="Ingest"/>
+ <data name="Category" value="DML"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="HDFS"/>
+ <data name="Category" value="Tables"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">LOAD DATA statement</indexterm>
+ The <codeph>LOAD DATA</codeph> statement streamlines the ETL process for an internal Impala table by moving a
+ data file or all the data files in a directory from an HDFS location into the Impala data directory for that
+ table.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>LOAD DATA INPATH '<varname>hdfs_file_or_directory_path</varname>' [OVERWRITE] INTO TABLE <varname>tablename</varname>
+ [PARTITION (<varname>partcol1</varname>=<varname>val1</varname>, <varname>partcol2</varname>=<varname>val2</varname> ...)]</codeblock>
+
+ <p>
+ When the <codeph>LOAD DATA</codeph> statement operates on a partitioned table,
+ it always operates on one partition at a time. Specify the <codeph>PARTITION</codeph> clauses
+ and list all the partition key columns, with a constant value specified for each.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/dml_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <ul>
+ <li>
+ The loaded data files are moved, not copied, into the Impala data directory.
+ </li>
+
+ <li>
+ You can specify the HDFS path of a single file to be moved, or the HDFS path of a directory to move all the
+ files inside that directory. You cannot specify any sort of wildcard to take only some of the files from a
+ directory. When loading a directory full of data files, keep all the data files at the top level, with no
+ nested directories underneath.
+ </li>
+
+ <li>
+ Currently, the Impala <codeph>LOAD DATA</codeph> statement only imports files from HDFS, not from the local
+ filesystem. It does not support the <codeph>LOCAL</codeph> keyword of the Hive <codeph>LOAD DATA</codeph>
+ statement. You must specify a path, not an <codeph>hdfs://</codeph> URI.
+ </li>
+
+ <li>
+ In the interest of speed, only limited error checking is done. If the loaded files have the wrong file
+ format, different columns than the destination table, or other kind of mismatch, Impala does not raise any
+ error for the <codeph>LOAD DATA</codeph> statement. Querying the table afterward could produce a runtime
+ error or unexpected results. Currently, the only checking the <codeph>LOAD DATA</codeph> statement does is
+ to avoid mixing together uncompressed and LZO-compressed text files in the same table.
+ </li>
+
+ <li>
+ When you specify an HDFS directory name as the <codeph>LOAD DATA</codeph> argument, any hidden files in
+ that directory (files whose names start with a <codeph>.</codeph>) are not moved to the Impala data
+ directory.
+ </li>
+
+ <li>
+ The loaded data files retain their original names in the new location, unless a name conflicts with an
+ existing data file, in which case the name of the new file is modified slightly to be unique. (The
+ name-mangling is a slight difference from the Hive <codeph>LOAD DATA</codeph> statement, which replaces
+ identically named files.)
+ </li>
+
+ <li>
+ By providing an easy way to transport files from known locations in HDFS into the Impala data directory
+ structure, the <codeph>LOAD DATA</codeph> statement lets you avoid memorizing the locations and layout of
+ HDFS directory tree containing the Impala databases and tables. (For a quick way to check the location of
+ the data files for an Impala table, issue the statement <codeph>DESCRIBE FORMATTED
+ <varname>table_name</varname></codeph>.)
+ </li>
+
+ <li>
+ The <codeph>PARTITION</codeph> clause is especially convenient for ingesting new data for a partitioned
+ table. As you receive new data for a time period, geographic region, or other division that corresponds to
+ one or more partitioning columns, you can load that data straight into the appropriate Impala data
+ directory, which might be nested several levels down if the table is partitioned by multiple columns. When
+ the table is partitioned, you must specify constant values for all the partitioning columns.
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p rev="2.3.0">
+ Because Impala currently cannot create Parquet data files containing complex types
+ (<codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, and <codeph>MAP</codeph>), the
+ <codeph>LOAD DATA</codeph> statement is especially important when working with
+ tables containing complex type columns. You create the Parquet data files outside
+ Impala, then use either <codeph>LOAD DATA</codeph>, an external table, or HDFS-level
+ file operations followed by <codeph>REFRESH</codeph> to associate the data files with
+ the corresponding table.
+ See <xref href="impala_complex_types.xml#complex_types"/> for details about using complex types.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+
+ <note conref="../shared/impala_common.xml#common/compute_stats_next"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ First, we use a trivial Python script to write different numbers of strings (one per line) into files stored
+ in the <codeph>cloudera</codeph> HDFS user account. (Substitute the path for your own HDFS user account when
+ doing <cmdname>hdfs dfs</cmdname> operations like these.)
+ </p>
+
+<codeblock>$ random_strings.py 1000 | hdfs dfs -put - /user/cloudera/thousand_strings.txt
+$ random_strings.py 100 | hdfs dfs -put - /user/cloudera/hundred_strings.txt
+$ random_strings.py 10 | hdfs dfs -put - /user/cloudera/ten_strings.txt</codeblock>
+
+ <p>
+ Next, we create a table and load an initial set of data into it. Remember, unless you specify a
+ <codeph>STORED AS</codeph> clause, Impala tables default to <codeph>TEXTFILE</codeph> format with Ctrl-A (hex
+ 01) as the field delimiter. This example uses a single-column table, so the delimiter is not significant. For
+ large-scale ETL jobs, you would typically use binary format data files such as Parquet or Avro, and load them
+ into Impala tables that use the corresponding file format.
+ </p>
+
+<codeblock>[localhost:21000] > create table t1 (s string);
+[localhost:21000] > load data inpath '/user/cloudera/thousand_strings.txt' into table t1;
+Query finished, fetching results ...
++----------------------------------------------------------+
+| summary |
++----------------------------------------------------------+
+| Loaded 1 file(s). Total files in destination location: 1 |
++----------------------------------------------------------+
+Returned 1 row(s) in 0.61s
+[kilo2-202-961.cs1cloud.internal:21000] > select count(*) from t1;
+Query finished, fetching results ...
++------+
+| _c0 |
++------+
+| 1000 |
++------+
+Returned 1 row(s) in 0.67s
+[localhost:21000] > load data inpath '/user/cloudera/thousand_strings.txt' into table t1;
+ERROR: AnalysisException: INPATH location '/user/cloudera/thousand_strings.txt' does not exist. </codeblock>
+
+ <p>
+ As indicated by the message at the end of the previous example, the data file was moved from its original
+ location. The following example illustrates how the data file was moved into the Impala data directory for
+ the destination table, keeping its original filename:
+ </p>
+
+<codeblock>$ hdfs dfs -ls /user/hive/warehouse/load_data_testing.db/t1
+Found 1 items
+-rw-r--r-- 1 cloudera cloudera 13926 2013-06-26 15:40 /user/hive/warehouse/load_data_testing.db/t1/thousand_strings.txt</codeblock>
+
+ <p>
+ The following example demonstrates the difference between the <codeph>INTO TABLE</codeph> and
+      <codeph>OVERWRITE INTO TABLE</codeph> clauses. The table already contains 1000 rows. After issuing the
+ <codeph>LOAD DATA</codeph> statement with the <codeph>INTO TABLE</codeph> clause, the table contains 100 more
+ rows, for a total of 1100. After issuing the <codeph>LOAD DATA</codeph> statement with the <codeph>OVERWRITE
+ INTO TABLE</codeph> clause, the former contents are gone, and now the table only contains the 10 rows from
+ the just-loaded data file.
+ </p>
+
+<codeblock>[localhost:21000] > load data inpath '/user/cloudera/hundred_strings.txt' into table t1;
+Query finished, fetching results ...
++----------------------------------------------------------+
+| summary |
++----------------------------------------------------------+
+| Loaded 1 file(s). Total files in destination location: 2 |
++----------------------------------------------------------+
+Returned 1 row(s) in 0.24s
+[localhost:21000] > select count(*) from t1;
+Query finished, fetching results ...
++------+
+| _c0 |
++------+
+| 1100 |
++------+
+Returned 1 row(s) in 0.55s
+[localhost:21000] > load data inpath '/user/cloudera/ten_strings.txt' overwrite into table t1;
+Query finished, fetching results ...
++----------------------------------------------------------+
+| summary |
++----------------------------------------------------------+
+| Loaded 1 file(s). Total files in destination location: 1 |
++----------------------------------------------------------+
+Returned 1 row(s) in 0.26s
+[localhost:21000] > select count(*) from t1;
+Query finished, fetching results ...
++-----+
+| _c0 |
++-----+
+| 10 |
++-----+
+Returned 1 row(s) in 0.62s</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/s3_blurb"/>
+ <p conref="../shared/impala_common.xml#common/s3_dml"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have read and write
+ permissions for the files in the source directory, and write
+ permission for the destination directory.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ The <codeph>LOAD DATA</codeph> statement is an alternative to the
+ <codeph>INSERT</codeph> statement. Use <codeph>LOAD DATA</codeph>
+ when you have the data files in HDFS but outside of any Impala table.
+ </p>
+ <p>
+ The <codeph>LOAD DATA</codeph> statement is also an alternative
+ to the <codeph>CREATE EXTERNAL TABLE</codeph> statement. Use
+ <codeph>LOAD DATA</codeph> when it is appropriate to move the
+ data files under Impala control rather than querying them
+ from their original location.
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_map.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_map.xml b/docs/topics/impala_map.xml
new file mode 100644
index 0000000..41e4754
--- /dev/null
+++ b/docs/topics/impala_map.xml
@@ -0,0 +1,264 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+ <concept id="map">
+
+ <title>MAP Complex Type (CDH 5.5 or higher only)</title>
+
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ A complex data type representing an arbitrary set of key-value pairs.
+ The key part is a scalar type, while the value part can be a scalar or
+ another complex type (<codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>,
+ or <codeph>MAP</codeph>).
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock><varname>column_name</varname> MAP &lt; <varname>primitive_type</varname>, <varname>type</varname> &gt;
+
+type ::= <varname>primitive_type</varname> | <varname>complex_type</varname>
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_combo"/>
+
+ <p>
+ The <codeph>MAP</codeph> complex data type represents a set of key-value pairs.
+ Each element of the map is indexed by a primitive type such as <codeph>BIGINT</codeph> or
+ <codeph>STRING</codeph>, letting you define sequences that are not continuous or categories with arbitrary names.
+ You might find it convenient for modelling data produced in other languages, such as a
+ Python dictionary or Java HashMap, where a single scalar value serves as the lookup key.
+ </p>
+
+ <p>
+ In a big data context, the keys in a map column might represent a numeric sequence of events during a
+ manufacturing process, or <codeph>TIMESTAMP</codeph> values corresponding to sensor observations.
+ The map itself is inherently unordered, so you choose whether to make the key values significant
+ (such as a recorded <codeph>TIMESTAMP</codeph>) or synthetic (such as a random global universal ID).
+ </p>
+
+ <note>
+ Behind the scenes, the <codeph>MAP</codeph> type is implemented in a similar way as the
+ <codeph>ARRAY</codeph> type. Impala does not enforce any uniqueness constraint on the
+ <codeph>KEY</codeph> values, and the <codeph>KEY</codeph> values are processed by
+ looping through the elements of the <codeph>MAP</codeph> rather than by a constant-time lookup.
+ Therefore, this type is primarily for ease of understanding when importing data and
+ algorithms from non-SQL contexts, rather than optimizing the performance of key lookups.
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_describe"/>
+
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <ul conref="../shared/impala_common.xml#common/complex_types_restrictions">
+ <li/>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <note conref="../shared/impala_common.xml#common/complex_type_schema_pointer"/>
+
+ <p>
+ The following example shows a table with various kinds of <codeph>MAP</codeph> columns,
+ both at the top level and nested within other complex types.
+ Each row represents information about a specific country, with complex type fields
+ of various levels of nesting to represent different information associated
+ with the country: factual measurements such as area and population,
+ notable people in different categories, geographic features such as
+ cities, points of interest within each city, and mountains with associated facts.
+ Practice the <codeph>CREATE TABLE</codeph> and query notation for complex type columns
+ using empty tables, until you can visualize a complex data structure and construct corresponding SQL statements reliably.
+ </p>
+
+<codeblock><![CDATA[create TABLE map_demo
+(
+ country_id BIGINT,
+
+-- Numeric facts about each country, looked up by name.
+-- For example, 'Area':1000, 'Population':999999.
+-- Using a MAP instead of a STRUCT because there could be
+-- a different set of facts for each country.
+ metrics MAP <STRING, BIGINT>,
+
+-- MAP whose value part is an ARRAY.
+-- For example, the key 'Famous Politicians' could represent an array of 10 elements,
+-- while the key 'Famous Actors' could represent an array of 20 elements.
+ notables MAP <STRING, ARRAY <STRING>>,
+
+-- MAP that is a field within a STRUCT.
+-- (The STRUCT is inside another ARRAY, because it is rare
+-- for a STRUCT to be a top-level column.)
+-- For example, city #1 might have points of interest with key 'Zoo',
+-- representing an array of 3 different zoos.
+-- City #2 might have completely different kinds of points of interest.
+-- Because the set of field names is potentially large, and most entries could be blank,
+-- a MAP makes more sense than a STRUCT to represent such a sparse data structure.
+ cities ARRAY < STRUCT <
+ name: STRING,
+ points_of_interest: MAP <STRING, ARRAY <STRING>>
+ >>,
+
+-- MAP that is an element within an ARRAY. The MAP is inside a STRUCT field to associate
+-- the mountain name with all the facts about the mountain.
+-- The "key" of the map (the first STRING field) represents the name of some fact whose value
+-- can be expressed as an integer, such as 'Height', 'Year First Climbed', and so on.
+ mountains ARRAY < STRUCT < name: STRING, facts: MAP <STRING, INT > > >
+)
+STORED AS PARQUET;
+]]>
+</codeblock>
+
+<codeblock><![CDATA[DESCRIBE map_demo;
++------------+------------------------------------------------+
+| name | type |
++------------+------------------------------------------------+
+| country_id | bigint |
+| metrics | map<string,bigint> |
+| notables | map<string,array<string>> |
+| cities | array<struct< |
+| | name:string, |
+| | points_of_interest:map<string,array<string>> |
+| | >> |
+| mountains | array<struct< |
+| | name:string, |
+| | facts:map<string,int> |
+| | >> |
++------------+------------------------------------------------+
+
+DESCRIBE map_demo.metrics;
++-------+--------+
+| name | type |
++-------+--------+
+| key | string |
+| value | bigint |
++-------+--------+
+
+DESCRIBE map_demo.notables;
++-------+---------------+
+| name | type |
++-------+---------------+
+| key | string |
+| value | array<string> |
++-------+---------------+
+
+DESCRIBE map_demo.notables.value;
++------+--------+
+| name | type |
++------+--------+
+| item | string |
+| pos | bigint |
++------+--------+
+
+DESCRIBE map_demo.cities;
++------+------------------------------------------------+
+| name | type |
++------+------------------------------------------------+
+| item | struct< |
+| | name:string, |
+| | points_of_interest:map<string,array<string>> |
+| | > |
+| pos | bigint |
++------+------------------------------------------------+
+
+DESCRIBE map_demo.cities.item.points_of_interest;
++-------+---------------+
+| name | type |
++-------+---------------+
+| key | string |
+| value | array<string> |
++-------+---------------+
+
+DESCRIBE map_demo.cities.item.points_of_interest.value;
++------+--------+
+| name | type |
++------+--------+
+| item | string |
+| pos | bigint |
++------+--------+
+
+DESCRIBE map_demo.mountains;
++------+-------------------------+
+| name | type |
++------+-------------------------+
+| item | struct< |
+| | name:string, |
+| | facts:map<string,int> |
+| | > |
+| pos | bigint |
++------+-------------------------+
+
+DESCRIBE map_demo.mountains.item.facts;
++-------+--------+
+| name | type |
++-------+--------+
+| key | string |
+| value | int |
++-------+--------+
+]]>
+</codeblock>
+
+ <p>
+ The following example shows a table that uses a variety of data types for the <codeph>MAP</codeph>
+ <q>key</q> field. Typically, you use <codeph>BIGINT</codeph> or <codeph>STRING</codeph> to use
+ numeric or character-based keys without worrying about exceeding any size or length constraints.
+ </p>
+
+<codeblock><![CDATA[CREATE TABLE map_demo_obscure
+(
+ id BIGINT,
+ m1 MAP <INT, INT>,
+ m2 MAP <SMALLINT, INT>,
+ m3 MAP <TINYINT, INT>,
+ m4 MAP <TIMESTAMP, INT>,
+ m5 MAP <BOOLEAN, INT>,
+ m6 MAP <CHAR(5), INT>,
+ m7 MAP <VARCHAR(25), INT>,
+ m8 MAP <FLOAT, INT>,
+ m9 MAP <DOUBLE, INT>,
+ m10 MAP <DECIMAL(12,2), INT>
+)
+STORED AS PARQUET;
+]]>
+</codeblock>
+
+<codeblock>CREATE TABLE celebrities (name STRING, birth_year MAP &lt; STRING, SMALLINT &gt;) STORED AS PARQUET;
+-- A typical row might represent values with 2 different birth years, such as:
+-- ("Joe Movie Star", { "real": 1972, "claimed": 1977 })
+
+CREATE TABLE countries (name STRING, famous_leaders MAP &lt; INT, STRING &gt;) STORED AS PARQUET;
+-- A typical row might represent values with different leaders, with key values corresponding to their numeric sequence, such as:
+-- ("United States", { 1: "George Washington", 3: "Thomas Jefferson", 16: "Abraham Lincoln" })
+
+CREATE TABLE airlines (name STRING, special_meals MAP &lt; STRING, MAP &lt; STRING, STRING &gt; &gt;) STORED AS PARQUET;
+-- A typical row might represent values with multiple kinds of meals, each with several components:
+-- ("Elegant Airlines",
+-- {
+-- "vegetarian": { "breakfast": "pancakes", "snack": "cookies", "dinner": "rice pilaf" },
+-- "gluten free": { "breakfast": "oatmeal", "snack": "fruit", "dinner": "chicken" }
+-- } )
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_complex_types.xml#complex_types"/>,
+ <xref href="impala_array.xml#array"/>,
+ <xref href="impala_struct.xml#struct"/>
+ <!-- <xref href="impala_map.xml#map"/> -->
+ </p>
+
+ </conbody>
+
+ </concept>
+
+
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_math_functions.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_math_functions.xml b/docs/topics/impala_math_functions.xml
new file mode 100644
index 0000000..fd16b37
--- /dev/null
+++ b/docs/topics/impala_math_functions.xml
@@ -0,0 +1,1336 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="math_functions">
+
+ <title>Impala Mathematical Functions</title>
+ <titlealts><navtitle>Mathematical Functions</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Mathematical functions, or arithmetic functions, perform numeric calculations that are typically more complex
+ than basic addition, subtraction, multiplication, and division. For example, these functions include
+ trigonometric, logarithmic, and base conversion operations.
+ </p>
+
+ <note>
+ In Impala, exponentiation uses the <codeph>pow()</codeph> function rather than an exponentiation operator
+ such as <codeph>**</codeph>.
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ The mathematical functions operate mainly on these data types: <xref href="impala_int.xml#int"/>,
+ <xref href="impala_bigint.xml#bigint"/>, <xref href="impala_smallint.xml#smallint"/>,
+ <xref href="impala_tinyint.xml#tinyint"/>, <xref href="impala_double.xml#double"/>,
+ <xref href="impala_float.xml#float"/>, and <xref href="impala_decimal.xml#decimal"/>. For the operators that
+ perform the standard operations such as addition, subtraction, multiplication, and division, see
+ <xref href="impala_operators.xml#arithmetic_operators"/>.
+ </p>
+
+ <p>
+ Functions that perform bitwise operations are explained in <xref href="impala_bit_functions.xml#bit_functions"/>.
+ </p>
+
+ <p>
+ <b>Function reference:</b>
+ </p>
+
+ <p>
+ Impala supports the following mathematical functions:
+ </p>
+
+ <dl>
+ <dlentry rev="1.4.0" id="abs">
+
+ <dt rev="2.0.1">
+ <codeph>abs(numeric_type a)</codeph>
+<!-- <codeph>abs(double a), abs(decimal(p,s) a)</codeph> -->
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">abs() function</indexterm>
+ <b>Purpose:</b> Returns the absolute value of the argument.
+ <p rev="2.0.1" conref="../shared/impala_common.xml#common/return_type_same"/>
+ <p>
+ <b>Usage notes:</b> Use this function to ensure all return values are positive. This is different than
+ the <codeph>positive()</codeph> function, which returns its argument unchanged (even if the argument
+ was negative).
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="acos">
+
+ <dt>
+ <codeph>acos(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">acos() function</indexterm>
+ <b>Purpose:</b> Returns the arccosine of the argument.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="asin">
+
+ <dt>
+ <codeph>asin(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">asin() function</indexterm>
+ <b>Purpose:</b> Returns the arcsine of the argument.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="atan">
+
+ <dt>
+ <codeph>atan(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">atan() function</indexterm>
+ <b>Purpose:</b> Returns the arctangent of the argument.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="bin">
+
+ <dt>
+ <codeph>bin(bigint a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">bin() function</indexterm>
+ <b>Purpose:</b> Returns the binary representation of an integer value, that is, a string of 0 and 1
+ digits.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.4.0" id="ceil">
+
+ <dt>
+ <codeph>ceil(double a)</codeph>,
+ <codeph>ceil(decimal(p,s) a)</codeph>,
+ <codeph id="ceiling">ceiling(double a)</codeph>,
+ <codeph>ceiling(decimal(p,s) a)</codeph>,
+ <codeph id="dceil" rev="2.3.0">dceil(double a)</codeph>,
+ <codeph rev="2.3.0">dceil(decimal(p,s) a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">ceil() function</indexterm>
+ <b>Purpose:</b> Returns the smallest integer that is greater than or equal to the argument.
+ <p>
+          <b>Return type:</b> <codeph>bigint</codeph> or <codeph>decimal(p,s)</codeph> depending on the type of the
+ input argument
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="conv">
+
+ <dt>
+ <codeph>conv(bigint num, int from_base, int to_base), conv(string num, int from_base, int
+ to_base)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">conv() function</indexterm>
+ <b>Purpose:</b> Returns a string representation of an integer value in a particular base. The input value
+ can be a string, for example to convert a hexadecimal number such as <codeph>fce2</codeph> to decimal. To
+ use the return value as a number (for example, when converting to base 10), use <codeph>CAST()</codeph>
+ to convert to the appropriate type.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="cos">
+
+ <dt>
+ <codeph>cos(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">cos() function</indexterm>
+ <b>Purpose:</b> Returns the cosine of the argument.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="cot" rev="2.3.0">
+
+ <dt>
+ <codeph>cot(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">cot() function</indexterm>
+ <b>Purpose:</b> Returns the cotangent of the argument.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="degrees">
+
+ <dt>
+ <codeph>degrees(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">degrees() function</indexterm>
+ <b>Purpose:</b> Converts argument value from radians to degrees.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="e">
+
+ <dt>
+ <codeph>e()</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">e() function</indexterm>
+ <b>Purpose:</b> Returns the
+ <xref href="http://en.wikipedia.org/wiki/E_(mathematical_constant)" scope="external" format="html">mathematical
+ constant e</xref>.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="exp">
+
+ <dt>
+ <codeph>exp(double a)</codeph>,
+ <codeph rev="2.3.0" id="dexp">dexp(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">exp() function</indexterm>
+ <b>Purpose:</b> Returns the
+ <xref href="http://en.wikipedia.org/wiki/E_(mathematical_constant)" scope="external" format="html">mathematical
+ constant e</xref> raised to the power of the argument.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="2.3.0" id="factorial">
+
+ <dt>
+ <codeph>factorial(integer_type a)</codeph>
+ </dt>
+ <dd>
+ <indexterm audience="Cloudera">factorial() function</indexterm>
+ <b>Purpose:</b> Computes the <xref href="https://en.wikipedia.org/wiki/Factorial" scope="external" format="html">factorial</xref> of an integer value.
+ It works with any integer type.
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p>
+ <b>Usage notes:</b> You can use either the <codeph>factorial()</codeph> function or the <codeph>!</codeph> operator.
+ The factorial of 0 is 1. Likewise, the <codeph>factorial()</codeph> function returns 1 for any negative value.
+ The maximum positive value for the input argument is 20; a value of 21 or greater overflows the
+ range for a <codeph>BIGINT</codeph> and causes an error.
+      </p>
+      <p>
+        <b>Return type:</b> <codeph>bigint</codeph>
+      </p>
+<codeblock>select factorial(5);
++--------------+
+| factorial(5) |
++--------------+
+| 120 |
++--------------+
+
+select 5!;
++-----+
+| 5! |
++-----+
+| 120 |
++-----+
+
+select factorial(0);
++--------------+
+| factorial(0) |
++--------------+
+| 1 |
++--------------+
+
+select factorial(-100);
++-----------------+
+| factorial(-100) |
++-----------------+
+| 1 |
++-----------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="floor">
+
+ <dt>
+ <codeph>floor(double a)</codeph>,
+ <codeph>floor(decimal(p,s) a)</codeph>,
+ <codeph rev="2.3.0" id="dfloor">dfloor(double a)</codeph>,
+ <codeph rev="2.3.0">dfloor(decimal(p,s) a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">floor() function</indexterm>
+ <b>Purpose:</b> Returns the largest integer that is less than or equal to the argument.
+ <p>
+ <b>Return type:</b> <codeph>bigint</codeph> or <codeph>decimal(p,s)</codeph> depending on the type of
+ the input argument
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="fmod">
+
+ <dt>
+ <codeph>fmod(double a, double b), fmod(float a, float b)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">fmod() function</indexterm>
+ <b>Purpose:</b> Returns the modulus of a floating-point number. Equivalent to the <codeph>%</codeph> arithmetic operator.
+ <p>
+ <b>Return type:</b> <codeph>float</codeph> or <codeph>double</codeph>, depending on type of arguments
+ </p>
+ <p conref="../shared/impala_common.xml#common/added_in_111"/>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Because this function operates on <codeph>DOUBLE</codeph> or <codeph>FLOAT</codeph>
+ values, it is subject to potential rounding errors for values that cannot be
+ represented precisely. Prefer to use whole numbers, or values that you know
+ can be represented precisely by the <codeph>DOUBLE</codeph> or <codeph>FLOAT</codeph>
+ types.
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show equivalent operations with the <codeph>fmod()</codeph>
+ function and the <codeph>%</codeph> arithmetic operator, for values not subject
+ to any rounding error.
+ </p>
+<codeblock>select fmod(10,3);
++-------------+
+| fmod(10, 3) |
++-------------+
+| 1 |
++-------------+
+
+select fmod(5.5,2);
++--------------+
+| fmod(5.5, 2) |
++--------------+
+| 1.5 |
++--------------+
+
+select 10 % 3;
++--------+
+| 10 % 3 |
++--------+
+| 1 |
++--------+
+
+select 5.5 % 2;
++---------+
+| 5.5 % 2 |
++---------+
+| 1.5 |
++---------+
+</codeblock>
+ <p>
+ The following examples show operations with the <codeph>fmod()</codeph>
+ function for values that cannot be represented precisely by the
+ <codeph>DOUBLE</codeph> or <codeph>FLOAT</codeph> types, and thus are
+ subject to rounding error. <codeph>fmod(9.9,3.0)</codeph> returns a value
+ slightly different than the expected 0.9 because of rounding.
+ <codeph>fmod(9.9,3.3)</codeph> returns a value quite different from
+ the expected value of 0 because of rounding error during intermediate
+ calculations.
+ </p>
+<codeblock>select fmod(9.9,3.0);
++--------------------+
+| fmod(9.9, 3.0) |
++--------------------+
+| 0.8999996185302734 |
++--------------------+
+
+select fmod(9.9,3.3);
++-------------------+
+| fmod(9.9, 3.3) |
++-------------------+
+| 3.299999713897705 |
++-------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.2.2" id="fnv_hash">
+
+ <dt>
+ <codeph>fnv_hash(type v)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">fnv_hash() function</indexterm>
+ <b>Purpose:</b> Returns a consistent 64-bit value derived from the input argument, for convenience of
+ implementing hashing logic in an application.
+ <p>
+ <b>Return type:</b> <codeph>BIGINT</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ You might use the return value in an application where you perform load balancing, bucketing, or some
+ other technique to divide processing or storage.
+ </p>
+ <p>
+ Because the result can be any 64-bit value, to restrict the value to a particular range, you can use an
+ expression that includes the <codeph>ABS()</codeph> function and the <codeph>%</codeph> (modulo)
+ operator. For example, to produce a hash value in the range 0-9, you could use the expression
+ <codeph>ABS(FNV_HASH(x)) % 10</codeph>.
+ </p>
+ <p>
+ This function implements the same algorithm that Impala uses internally for hashing, on systems where
+ the CRC32 instructions are not available.
+ </p>
+ <p>
+ This function implements the
+ <xref href="http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function" scope="external" format="html">Fowler&#8211;Noll&#8211;Vo
+ hash function</xref>, in particular the FNV-1a variation. This is not a perfect hash function: some
+ combinations of values could produce the same result value. It is not suitable for cryptographic use.
+ </p>
+ <p>
+ Similar input values of different types could produce different hash values, for example the same
+ numeric value represented as <codeph>SMALLINT</codeph> or <codeph>BIGINT</codeph>,
+ <codeph>FLOAT</codeph> or <codeph>DOUBLE</codeph>, or <codeph>DECIMAL(5,2)</codeph> or
+ <codeph>DECIMAL(20,5)</codeph>.
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>[localhost:21000] > create table h (x int, s string);
+[localhost:21000] > insert into h values (0, 'hello'), (1,'world'), (1234567890,'antidisestablishmentarianism');
+[localhost:21000] > select x, fnv_hash(x) from h;
++------------+----------------------+
+| x | fnv_hash(x) |
++------------+----------------------+
+| 0 | -2611523532599129963 |
+| 1 | 4307505193096137732 |
+| 1234567890 | 3614724209955230832 |
++------------+----------------------+
+[localhost:21000] > select s, fnv_hash(s) from h;
++------------------------------+---------------------+
+| s | fnv_hash(s) |
++------------------------------+---------------------+
+| hello | 6414202926103426347 |
+| world | 6535280128821139475 |
+| antidisestablishmentarianism | -209330013948433970 |
++------------------------------+---------------------+
+[localhost:21000] > select s, abs(fnv_hash(s)) % 10 from h;
++------------------------------+-------------------------+
+| s | abs(fnv_hash(s)) % 10.0 |
++------------------------------+-------------------------+
+| hello | 8 |
+| world | 6 |
+| antidisestablishmentarianism | 4 |
++------------------------------+-------------------------+</codeblock>
+ <p>
+ For short argument values, the high-order bits of the result have relatively low entropy:
+ </p>
+<codeblock>[localhost:21000] > create table b (x boolean);
+[localhost:21000] > insert into b values (true), (true), (false), (false);
+[localhost:21000] > select x, fnv_hash(x) from b;
++-------+---------------------+
+| x | fnv_hash(x) |
++-------+---------------------+
+| true | 2062020650953872396 |
+| true | 2062020650953872396 |
+| false | 2062021750465500607 |
+| false | 2062021750465500607 |
++-------+---------------------+</codeblock>
+ <p>
+ <b>Added in:</b> Impala 1.2.2
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.4.0" id="greatest">
+
+ <dt>
+ <codeph>greatest(bigint a[, bigint b ...])</codeph>, <codeph>greatest(double a[, double b ...])</codeph>,
+ <codeph>greatest(decimal(p,s) a[, decimal(p,s) b ...])</codeph>, <codeph>greatest(string a[, string b
+ ...])</codeph>, <codeph>greatest(timestamp a[, timestamp b ...])</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">greatest() function</indexterm>
+ <b>Purpose:</b> Returns the largest value from a list of expressions.
+ <p conref="../shared/impala_common.xml#common/return_same_type"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="hex">
+
+ <dt>
+ <codeph>hex(bigint a), hex(string a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">hex() function</indexterm>
+ <b>Purpose:</b> Returns the hexadecimal representation of an integer value, or of the characters in a
+ string.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.4.0" id="is_inf">
+
+ <dt>
+ <codeph>is_inf(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">is_inf() function</indexterm>
+ <b>Purpose:</b> Tests whether a value is equal to the special value <q>inf</q>, signifying infinity.
+ <p>
+ <b>Return type:</b> <codeph>boolean</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Infinity and NaN can be specified in text data files as <codeph>inf</codeph> and <codeph>nan</codeph>
+ respectively, and Impala interprets them as these special values. They can also be produced by certain
+ arithmetic expressions; for example, <codeph>pow(-1, 0.5)</codeph> returns infinity and
+ <codeph>1/0</codeph> returns NaN. Or you can cast the literal values, such as <codeph>CAST('nan' AS
+ DOUBLE)</codeph> or <codeph>CAST('inf' AS DOUBLE)</codeph>.
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.4.0" id="is_nan">
+
+ <dt>
+ <codeph>is_nan(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">is_nan() function</indexterm>
+ <b>Purpose:</b> Tests whether a value is equal to the special value <q>NaN</q>, signifying <q>not a
+ number</q>.
+ <p>
+ <b>Return type:</b> <codeph>boolean</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Infinity and NaN can be specified in text data files as <codeph>inf</codeph> and <codeph>nan</codeph>
+ respectively, and Impala interprets them as these special values. They can also be produced by certain
+ arithmetic expressions; for example, <codeph>pow(-1, 0.5)</codeph> returns infinity and
+ <codeph>1/0</codeph> returns NaN. Or you can cast the literal values, such as <codeph>CAST('nan' AS
+ DOUBLE)</codeph> or <codeph>CAST('inf' AS DOUBLE)</codeph>.
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.4.0" id="least">
+
+ <dt>
+ <codeph>least(bigint a[, bigint b ...])</codeph>, <codeph>least(double a[, double b ...])</codeph>,
+ <codeph>least(decimal(p,s) a[, decimal(p,s) b ...])</codeph>, <codeph>least(string a[, string b
+ ...])</codeph>, <codeph>least(timestamp a[, timestamp b ...])</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">least() function</indexterm>
+ <b>Purpose:</b> Returns the smallest value from a list of expressions.
+ <p conref="../shared/impala_common.xml#common/return_same_type"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="ln">
+
+ <dt>
+ <codeph>ln(double a)</codeph>,
+ <codeph rev="2.3.0" id="dlog1">dlog1(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">ln() function</indexterm>
+ <indexterm audience="Cloudera">dlog1() function</indexterm>
+ <b>Purpose:</b> Returns the
+ <xref href="https://en.wikipedia.org/wiki/Natural_logarithm" scope="external" format="html">natural
+ logarithm</xref> of the argument.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="log">
+
+ <dt>
+ <codeph>log(double base, double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">log() function</indexterm>
+ <b>Purpose:</b> Returns the logarithm of the second argument to the specified base.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="log10">
+
+ <dt>
+ <codeph>log10(double a)</codeph>,
+ <codeph rev="2.3.0" id="dlog10">dlog10(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">log10() function</indexterm>
+ <indexterm audience="Cloudera">dlog10() function</indexterm>
+ <b>Purpose:</b> Returns the logarithm of the argument to the base 10.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="log2">
+
+ <dt>
+ <codeph>log2(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">log2() function</indexterm>
+ <b>Purpose:</b> Returns the logarithm of the argument to the base 2.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.4.0" id="max_int">
+
+ <dt>
+ <codeph>max_int(), <ph id="max_tinyint">max_tinyint()</ph>, <ph id="max_smallint">max_smallint()</ph>,
+ <ph id="max_bigint">max_bigint()</ph></codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">max_int() function</indexterm>
+ <indexterm audience="Cloudera">max_tinyint() function</indexterm>
+ <indexterm audience="Cloudera">max_smallint() function</indexterm>
+ <indexterm audience="Cloudera">max_bigint() function</indexterm>
+ <b>Purpose:</b> Returns the largest value of the associated integral type.
+ <p>
+ <b>Return type:</b> The same as the integral type being checked.
+ </p>
+ <p>
+<!-- Repeated usage text between max_ and min_ functions, could turn into a conref. -->
+ <b>Usage notes:</b> Use the corresponding <codeph>min_</codeph> and <codeph>max_</codeph> functions to
+ check if all values in a column are within the allowed range, before copying data or altering column
+ definitions. If not, switch to the next higher integral type or to a <codeph>DECIMAL</codeph> with
+ sufficient precision.
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.4.0" id="min_int">
+
+ <dt>
+ <codeph>min_int(), <ph id="min_tinyint">min_tinyint()</ph>, <ph id="min_smallint">min_smallint()</ph>,
+ <ph id="min_bigint">min_bigint()</ph></codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">min_int() function</indexterm>
+ <indexterm audience="Cloudera">min_tinyint() function</indexterm>
+ <indexterm audience="Cloudera">min_smallint() function</indexterm>
+ <indexterm audience="Cloudera">min_bigint() function</indexterm>
+ <b>Purpose:</b> Returns the smallest value of the associated integral type (a negative number).
+ <p>
+ <b>Return type:</b> The same as the integral type being checked.
+ </p>
+ <p>
+ <b>Usage notes:</b> Use the corresponding <codeph>min_</codeph> and <codeph>max_</codeph> functions to
+ check if all values in a column are within the allowed range, before copying data or altering column
+ definitions. If not, switch to the next higher integral type or to a <codeph>DECIMAL</codeph> with
+ sufficient precision.
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="mod" rev="2.2.0">
+
+ <dt>
+ <codeph>mod(<varname>numeric_type</varname> a, <varname>same_type</varname> b)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">mod() function</indexterm>
+ <b>Purpose:</b> Returns the modulus of a number. Equivalent to the <codeph>%</codeph> arithmetic operator.
+ Works with any size integer type, any size floating-point type, and <codeph>DECIMAL</codeph>
+ with any precision and scale.
+ <p conref="../shared/impala_common.xml#common/return_type_same"/>
+ <p conref="../shared/impala_common.xml#common/added_in_220"/>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Because this function works with <codeph>DECIMAL</codeph> values, prefer it over <codeph>fmod()</codeph>
+ when working with fractional values. It is not subject to the rounding errors that make
+ <codeph>fmod()</codeph> problematic with floating-point numbers.
+ The <codeph>%</codeph> arithmetic operator now uses the <codeph>mod()</codeph> function
+ in cases where its arguments can be interpreted as <codeph>DECIMAL</codeph> values,
+ increasing the accuracy of that operator.
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show how the <codeph>mod()</codeph> function works for
+ whole numbers and fractional values, and how the <codeph>%</codeph> operator
+ works the same way. In the case of <codeph>mod(9.9,3)</codeph>,
+ the type conversion for the second argument results in the first argument
+ being interpreted as <codeph>DOUBLE</codeph>, so to produce an accurate
+ <codeph>DECIMAL</codeph> result requires casting the second argument
+ or writing it as a <codeph>DECIMAL</codeph> literal, 3.0.
+ </p>
+<codeblock>select mod(10,3);
++-------------+
+| mod(10, 3)  |
++-------------+
+| 1 |
++-------------+
+
+select mod(5.5,2);
++--------------+
+| mod(5.5, 2)  |
++--------------+
+| 1.5 |
++--------------+
+
+select 10 % 3;
++--------+
+| 10 % 3 |
++--------+
+| 1 |
++--------+
+
+select 5.5 % 2;
++---------+
+| 5.5 % 2 |
++---------+
+| 1.5 |
++---------+
+
+select mod(9.9,3.3);
++---------------+
+| mod(9.9, 3.3) |
++---------------+
+| 0.0 |
++---------------+
+
+select mod(9.9,3);
++--------------------+
+| mod(9.9, 3) |
++--------------------+
+| 0.8999996185302734 |
++--------------------+
+
+select mod(9.9, cast(3 as decimal(2,1)));
++-----------------------------------+
+| mod(9.9, cast(3 as decimal(2,1))) |
++-----------------------------------+
+| 0.9 |
++-----------------------------------+
+
+select mod(9.9,3.0);
++---------------+
+| mod(9.9, 3.0) |
++---------------+
+| 0.9 |
++---------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.4.0" id="negative">
+
+ <dt rev="2.0.1">
+ <codeph>negative(numeric_type a)</codeph>
+<!-- <codeph>negative(int a), negative(double a), negative(decimal(p,s) a)</codeph> -->
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">negative() function</indexterm>
+ <b>Purpose:</b> Returns the argument with the sign reversed; returns a positive value if the argument was
+ already negative.
+ <p rev="2.0.1" conref="../shared/impala_common.xml#common/return_type_same"/>
+<!--
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>, <codeph>double</codeph>,
+ or <codeph>decimal(p,s)</codeph> depending on type of argument
+ </p>
+ -->
+ <p>
+ <b>Usage notes:</b> Use <codeph>-abs(a)</codeph> instead if you need to ensure all return values are
+ negative.
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="pi">
+
+ <dt>
+ <codeph>pi()</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">pi() function</indexterm>
+ <b>Purpose:</b> Returns the constant pi.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="pmod">
+
+ <dt>
+ <codeph>pmod(bigint a, bigint b), pmod(double a, double b)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">pmod() function</indexterm>
+ <b>Purpose:</b> Returns the positive modulus of a number.
+ Primarily for <xref href="https://issues.apache.org/jira/browse/HIVE-656" scope="external" format="html">HiveQL compatibility</xref>.
+ <p>
+ <b>Return type:</b> <codeph>bigint</codeph> or <codeph>double</codeph>, depending on type of arguments
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show how the <codeph>fmod()</codeph> function sometimes returns a negative value
+ depending on the sign of its arguments, and the <codeph>pmod()</codeph> function returns the same value
+ as <codeph>fmod()</codeph>, but sometimes with the sign flipped.
+ </p>
+<codeblock>select fmod(-5,2);
++-------------+
+| fmod(-5, 2) |
++-------------+
+| -1 |
++-------------+
+
+select pmod(-5,2);
++-------------+
+| pmod(-5, 2) |
++-------------+
+| 1 |
++-------------+
+
+select fmod(-5,-2);
++--------------+
+| fmod(-5, -2) |
++--------------+
+| -1 |
++--------------+
+
+select pmod(-5,-2);
++--------------+
+| pmod(-5, -2) |
++--------------+
+| -1 |
++--------------+
+
+select fmod(5,-2);
++-------------+
+| fmod(5, -2) |
++-------------+
+| 1 |
++-------------+
+
+select pmod(5,-2);
++-------------+
+| pmod(5, -2) |
++-------------+
+| -1 |
++-------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.4.0" id="positive">
+
+ <dt rev="2.0.1">
+ <codeph>positive(numeric_type a)</codeph>
+<!-- <codeph>positive(int a), positive(double a), positive(decimal(p,s) a</codeph> -->
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">positive() function</indexterm>
+ <b>Purpose:</b> Returns the original argument unchanged (even if the argument is negative).
+ <p rev="2.0.1" conref="../shared/impala_common.xml#common/return_type_same"/>
+<!--
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>, <codeph>double</codeph>,
+ or <codeph>decimal(p,s)</codeph> depending on type of argument
+ </p>
+ -->
+ <p>
+ <b>Usage notes:</b> Use <codeph>abs()</codeph> instead if you need to ensure all return values are
+ positive.
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="pow">
+
+ <dt>
+ <codeph>pow(double a, double p)</codeph>,
+ <codeph id="power">power(double a, double p)</codeph>,
+ <codeph rev="2.3.0" id="dpow">dpow(double a, double p)</codeph>,
+ <codeph rev="2.3.0" id="fpow">fpow(double a, double p)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">pow() function</indexterm>
+ <indexterm audience="Cloudera">power() function</indexterm>
+ <indexterm audience="Cloudera">dpow() function</indexterm>
+ <indexterm audience="Cloudera">fpow() function</indexterm>
+ <b>Purpose:</b> Returns the first argument raised to the power of the second argument.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.4.0" id="precision">
+
+ <dt>
+ <codeph>precision(<varname>numeric_expression</varname>)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">precision() function</indexterm>
+ <b>Purpose:</b> Computes the precision (number of decimal digits) needed to represent the type of the
+ argument expression as a <codeph>DECIMAL</codeph> value.
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Typically used in combination with the <codeph>scale()</codeph> function, to determine the appropriate
+ <codeph>DECIMAL(<varname>precision</varname>,<varname>scale</varname>)</codeph> type to declare in a
+ <codeph>CREATE TABLE</codeph> statement or <codeph>CAST()</codeph> function.
+ </p>
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p conref="../shared/impala_common.xml#common/precision_scale_example"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="quotient">
+
+ <dt>
+ <codeph>quotient(int numerator, int denominator)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">quotient() function</indexterm>
+ <b>Purpose:</b> Returns the first argument divided by the second argument, discarding any fractional
+ part. Avoids promoting arguments to <codeph>DOUBLE</codeph> as happens with the <codeph>/</codeph> SQL
+ operator.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="radians">
+
+ <dt>
+ <codeph>radians(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">radians() function</indexterm>
+ <b>Purpose:</b> Converts argument value from degrees to radians.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="rand">
+
+ <dt>
+ <codeph>rand()</codeph>, <codeph>rand(int seed)</codeph>,
+ <codeph rev="2.3.0" id="random">random()</codeph>,
+ <codeph rev="2.3.0">random(int seed)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">rand() function</indexterm>
+ <b>Purpose:</b> Returns a random value between 0 and 1. After <codeph>rand()</codeph> is called with a
+ seed argument, it produces a consistent random sequence based on the seed value.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ <p>
+ <b>Usage notes:</b> Currently, the random sequence is reset after each query, and multiple calls to
+ <codeph>rand()</codeph> within the same query return the same value each time. To produce a different
+ sequence of random numbers for each query, pass a unique seed value to each call to
+ <codeph>rand()</codeph>. For example, <codeph>select rand(unix_timestamp()) from ...</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show how <codeph>rand()</codeph> can produce sequences of varying predictability,
+ so that you can reproduce query results involving random values or generate unique sequences of random
+ values for each query.
+ When <codeph>rand()</codeph> is called with no argument, it generates the same sequence of values each time,
+ regardless of the ordering of the result set.
+ When <codeph>rand()</codeph> is called with a constant integer, it generates a different sequence of values,
+ but still always the same sequence for the same seed value.
+ If you pass in a seed value that changes, such as the return value of the expression <codeph>unix_timestamp(now())</codeph>,
+ each query will use a different sequence of random values, potentially more useful in probability calculations although
+ more difficult to reproduce at a later time. Therefore, the final two examples with an unpredictable seed value
+ also include the seed in the result set, to make it possible to reproduce the same random sequence later.
+ </p>
+<codeblock>select x, rand() from three_rows;
++---+-----------------------+
+| x | rand() |
++---+-----------------------+
+| 1 | 0.0004714746030380365 |
+| 2 | 0.5895895192351144 |
+| 3 | 0.4431900859080209 |
++---+-----------------------+
+
+select x, rand() from three_rows order by x desc;
++---+-----------------------+
+| x | rand() |
++---+-----------------------+
+| 3 | 0.0004714746030380365 |
+| 2 | 0.5895895192351144 |
+| 1 | 0.4431900859080209 |
++---+-----------------------+
+
+select x, rand(1234) from three_rows order by x;
++---+----------------------+
+| x | rand(1234) |
++---+----------------------+
+| 1 | 0.7377511392057646 |
+| 2 | 0.009428468537250751 |
+| 3 | 0.208117277924026 |
++---+----------------------+
+
+select x, rand(1234) from three_rows order by x desc;
++---+----------------------+
+| x | rand(1234) |
++---+----------------------+
+| 3 | 0.7377511392057646 |
+| 2 | 0.009428468537250751 |
+| 1 | 0.208117277924026 |
++---+----------------------+
+
+select x, unix_timestamp(now()), rand(unix_timestamp(now()))
+ from three_rows order by x;
++---+-----------------------+-----------------------------+
+| x | unix_timestamp(now()) | rand(unix_timestamp(now())) |
++---+-----------------------+-----------------------------+
+| 1 | 1440777752 | 0.002051228658320023 |
+| 2 | 1440777752 | 0.5098743483004506 |
+| 3 | 1440777752 | 0.9517714925817081 |
++---+-----------------------+-----------------------------+
+
+select x, unix_timestamp(now()), rand(unix_timestamp(now()))
+ from three_rows order by x desc;
++---+-----------------------+-----------------------------+
+| x | unix_timestamp(now()) | rand(unix_timestamp(now())) |
++---+-----------------------+-----------------------------+
+| 3 | 1440777761 | 0.9985985015512437 |
+| 2 | 1440777761 | 0.3251255333074953 |
+| 1 | 1440777761 | 0.02422675025846192 |
++---+-----------------------+-----------------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="round">
+
+ <dt>
+ <codeph>round(double a)</codeph>,
+ <codeph>round(double a, int d)</codeph>,
+ <codeph rev="1.4.0">round(decimal a, int_type d)</codeph>,
+ <codeph rev="2.3.0" id="dround">dround(double a)</codeph>,
+ <codeph rev="2.3.0">dround(double a, int d)</codeph>,
+ <codeph rev="2.3.0">dround(decimal(p,s) a, int_type d)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">round() function</indexterm>
+ <indexterm audience="Cloudera">dround() function</indexterm>
+ <b>Purpose:</b> Rounds a floating-point value. By default (with a single argument), rounds to the nearest
+ integer. Values ending in .5 are rounded up for positive numbers, down for negative numbers (that is,
+ away from zero). The optional second argument specifies how many digits to leave after the decimal point;
+ values greater than zero produce a floating-point return value rounded to the requested number of digits
+ to the right of the decimal point.
+ <p rev="1.4.0">
+ <b>Return type:</b> <codeph>bigint</codeph> for single <codeph>double</codeph> argument.
+ <codeph>double</codeph> for two-argument signature when second argument greater than zero.
+ For <codeph>DECIMAL</codeph> values, the smallest
+ <codeph>DECIMAL(<varname>p</varname>,<varname>s</varname>)</codeph> type with appropriate precision and
+ scale.
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.4.0" id="scale">
+
+ <dt>
+ <codeph>scale(<varname>numeric_expression</varname>)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">scale() function</indexterm>
+ <b>Purpose:</b> Computes the scale (number of decimal digits to the right of the decimal point) needed to
+ represent the type of the argument expression as a <codeph>DECIMAL</codeph> value.
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Typically used in combination with the <codeph>precision()</codeph> function, to determine the
+ appropriate <codeph>DECIMAL(<varname>precision</varname>,<varname>scale</varname>)</codeph> type to
+ declare in a <codeph>CREATE TABLE</codeph> statement or <codeph>CAST()</codeph> function.
+ </p>
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p conref="../shared/impala_common.xml#common/precision_scale_example"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="sign">
+
+ <dt>
+ <codeph>sign(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">sign() function</indexterm>
+ <b>Purpose:</b> Returns -1, 0, or 1 to indicate the signedness of the argument value.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="sin">
+
+ <dt>
+ <codeph>sin(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">sin() function</indexterm>
+ <b>Purpose:</b> Returns the sine of the argument.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="sqrt">
+
+ <dt>
+ <codeph>sqrt(double a)</codeph>,
+ <codeph rev="2.3.0" id="dsqrt">dsqrt(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">sqrt() function</indexterm>
+ <indexterm audience="Cloudera">dsqrt() function</indexterm>
+ <b>Purpose:</b> Returns the square root of the argument.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="tan">
+
+ <dt>
+ <codeph>tan(double a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">tan() function</indexterm>
+ <b>Purpose:</b> Returns the tangent of the argument.
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="2.3.0" id="truncate">
+
+ <dt>
+ <codeph>truncate(double_or_decimal a[, digits_to_leave])</codeph>,
+ <ph id="dtrunc"><codeph>dtrunc(double_or_decimal a[, digits_to_leave])</codeph></ph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">truncate() function</indexterm>
+ <indexterm audience="Cloudera">dtrunc() function</indexterm>
+ <b>Purpose:</b> Removes some or all fractional digits from a numeric value.
+ With no second argument, removes all fractional digits, leaving an integer value.
+ The optional second argument specifies the number of fractional digits to include
+ in the return value, and only applies when the argument type is <codeph>DECIMAL</codeph>.
+ <codeph>truncate()</codeph> and <codeph>dtrunc()</codeph> are aliases for the same function.
+ <p>
+ <b>Return type:</b> <codeph>decimal</codeph> for <codeph>DECIMAL</codeph> arguments;
+ <codeph>bigint</codeph> for <codeph>DOUBLE</codeph> arguments
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>select truncate(3.45)
++----------------+
+| truncate(3.45) |
++----------------+
+| 3 |
++----------------+
+
+select truncate(-3.45)
++-----------------+
+| truncate(-3.45) |
++-----------------+
+| -3 |
++-----------------+
+
+select truncate(3.456,1)
++--------------------+
+| truncate(3.456, 1) |
++--------------------+
+| 3.4 |
++--------------------+
+
+select dtrunc(3.456,1)
++------------------+
+| dtrunc(3.456, 1) |
++------------------+
+| 3.4 |
++------------------+
+
+select truncate(3.456,2)
++--------------------+
+| truncate(3.456, 2) |
++--------------------+
+| 3.45 |
++--------------------+
+
+select truncate(3.456,7)
++--------------------+
+| truncate(3.456, 7) |
++--------------------+
+| 3.4560000 |
++--------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="unhex">
+
+ <dt>
+ <codeph>unhex(string a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">unhex() function</indexterm>
+ <b>Purpose:</b> Returns a string of characters with ASCII values corresponding to pairs of hexadecimal
+ digits in the argument.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+ </dl>
+ </conbody>
+</concept>
[10/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_invalidate_metadata.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_invalidate_metadata.xml b/docs/topics/impala_invalidate_metadata.xml
new file mode 100644
index 0000000..96fca7d
--- /dev/null
+++ b/docs/topics/impala_invalidate_metadata.xml
@@ -0,0 +1,236 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.1" id="invalidate_metadata">
+
+ <title>INVALIDATE METADATA Statement</title>
+ <titlealts><navtitle>INVALIDATE METADATA</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="Metastore"/>
+ <data name="Category" value="Schemas"/>
+ <data name="Category" value="Tables"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">INVALIDATE METADATA statement</indexterm>
+ Marks the metadata for one or all tables as stale. Required after a table is created through the Hive shell,
+ before the table is available for Impala queries. The next time the current Impala node performs a query
+ against a table whose metadata is invalidated, Impala reloads the associated metadata before the query
+ proceeds. This is a relatively expensive operation compared to the incremental metadata update done by the
+ <codeph>REFRESH</codeph> statement, so in the common scenario of adding new data files to an existing table,
+ prefer <codeph>REFRESH</codeph> rather than <codeph>INVALIDATE METADATA</codeph>. If you are not familiar
+ with the way Impala uses metadata and how it shares the same metastore database as Hive, see
+ <xref href="impala_hadoop.xml#intro_metastore"/> for background information.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>INVALIDATE METADATA [[<varname>db_name</varname>.]<varname>table_name</varname>]</codeblock>
+
+ <p>
+ By default, the cached metadata for all tables is flushed. If you specify a table name, only the metadata for
+ that one table is flushed. Even for a single table, <codeph>INVALIDATE METADATA</codeph> is more expensive
+ than <codeph>REFRESH</codeph>, so prefer <codeph>REFRESH</codeph> in the common case where you add new data
+ files for an existing table.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/internals_blurb"/>
+
+ <p>
+ To accurately respond to queries, Impala must have current metadata about those databases and tables that
+ clients query directly. Therefore, if some other entity modifies information used by Impala in the metastore
+ that Impala and Hive share, the information cached by Impala must be updated. However, this does not mean
+ that all metadata updates require an Impala update.
+ </p>
+
+ <note>
+ <p conref="../shared/impala_common.xml#common/catalog_server_124"/>
+ <p rev="1.2">
+ In Impala 1.2 and higher, a dedicated daemon (<cmdname>catalogd</cmdname>) broadcasts DDL changes made
+ through Impala to all Impala nodes. Formerly, after you created a database or table while connected to one
+ Impala node, you needed to issue an <codeph>INVALIDATE METADATA</codeph> statement on another Impala node
+ before accessing the new database or table from the other node. Now, newly created or altered objects are
+ picked up automatically by all Impala nodes. You must still use the <codeph>INVALIDATE METADATA</codeph>
+ technique after creating or altering objects through Hive. See
+ <xref href="impala_components.xml#intro_catalogd"/> for more information on the catalog service.
+ </p>
+ <p>
+ The <codeph>INVALIDATE METADATA</codeph> statement is new in Impala 1.1 and higher, and takes over some of
+ the use cases of the Impala 1.0 <codeph>REFRESH</codeph> statement. Because <codeph>REFRESH</codeph> now
+ requires a table name parameter, to flush the metadata for all tables at once, use the <codeph>INVALIDATE
+ METADATA</codeph> statement.
+ </p>
+ <draft-comment translate="no"> Almost-identical wording here, under INVALIDATE METADATA, and in Release Notes :: New Features. Makes sense to conref. </draft-comment>
+ <p>
+ Because <codeph>REFRESH <varname>table_name</varname></codeph> only works for tables that the current
+ Impala node is already aware of, when you create a new table in the Hive shell, you must enter
+ <codeph>INVALIDATE METADATA</codeph> with no table parameter before you can see the new table in
+ <cmdname>impala-shell</cmdname>. Once the table is known by the Impala node, you can issue <codeph>REFRESH
+ <varname>table_name</varname></codeph> after you add data files for that table.
+ </p>
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/refresh_vs_invalidate"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ A metadata update for an <codeph>impalad</codeph> instance <b>is</b> required if:
+ </p>
+
+ <ul>
+ <li>
+ A metadata change occurs.
+ </li>
+
+ <li>
+ <b>and</b> the change is made from another <codeph>impalad</codeph> instance in your cluster, or through
+ Hive.
+ </li>
+
+ <li>
+ <b>and</b> the change is made to a database to which clients such as the Impala shell or ODBC directly
+ connect.
+ </li>
+ </ul>
+
+ <p>
+ A metadata update for an Impala node is <b>not</b> required when you issue queries from the same Impala node
+ where you ran <codeph>ALTER TABLE</codeph>, <codeph>INSERT</codeph>, or other table-modifying statement.
+ </p>
+
+ <p>
+ Database and table metadata is typically modified by:
+ </p>
+
+ <ul>
+ <li>
+ Hive - via <codeph>ALTER</codeph>, <codeph>CREATE</codeph>, <codeph>DROP</codeph> or
+ <codeph>INSERT</codeph> operations.
+ </li>
+
+ <li>
+ Impalad - via <codeph>CREATE TABLE</codeph>, <codeph>ALTER TABLE</codeph>, and <codeph>INSERT</codeph>
+ operations.
+ </li>
+ </ul>
+
+ <p>
+ <codeph>INVALIDATE METADATA</codeph> causes the metadata for that table to be marked as stale, and reloaded
+ the next time the table is referenced. For a huge table, that process could take a noticeable amount of time;
+ thus you might prefer to use <codeph>REFRESH</codeph> where practical, to avoid an unpredictable delay later,
+ for example if the next reference to the table is during a benchmark test.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example shows how you might use the <codeph>INVALIDATE METADATA</codeph> statement after
+ creating new tables (such as SequenceFile or HBase tables) through the Hive shell. Before the
+ <codeph>INVALIDATE METADATA</codeph> statement was issued, Impala would give a <q>table not found</q> error
+ if you tried to refer to those table names. The <codeph>DESCRIBE</codeph> statements cause the latest
+ metadata to be immediately loaded for the tables, avoiding a delay the next time those tables are queried.
+ </p>
+
+<codeblock>[impalad-host:21000] > invalidate metadata;
+[impalad-host:21000] > describe t1;
+...
+[impalad-host:21000] > describe t2;
+... </codeblock>
+
+ <p>
+ For more examples of using <codeph>REFRESH</codeph> and <codeph>INVALIDATE METADATA</codeph> with a
+ combination of Impala and Hive operations, see <xref href="impala_tutorial.xml#tutorial_impala_hive"/>.
+ </p>
+
+ <p>
+ If you need to ensure that the metadata is up-to-date when you start an <cmdname>impala-shell</cmdname>
+ session, run <cmdname>impala-shell</cmdname> with the <codeph>-r</codeph> or
+ <codeph>--refresh_after_connect</codeph> command-line option. Because this operation adds a delay to the next
+ query against each table, potentially expensive for large tables with many partitions, try to avoid using
+ this option for day-to-day operations in a production environment.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have execute
+ permissions for all the relevant directories holding table data.
+ (A table could have data spread across multiple directories,
+ or in unexpected paths, if it uses partitioning or
+ specifies a <codeph>LOCATION</codeph> attribute for
+ individual partitions or the entire table.)
+ Issues with permissions might not cause an immediate error for this statement,
+ but subsequent statements such as <codeph>SELECT</codeph>
+ or <codeph>SHOW TABLE STATS</codeph> could fail.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/hdfs_blurb"/>
+
+ <p>
+ By default, the <codeph>INVALIDATE METADATA</codeph> command checks HDFS permissions of the underlying data
+ files and directories, caching this information so that a statement can be cancelled immediately if for
+ example the <codeph>impala</codeph> user does not have permission to write to the data directory for the
+ table. (This checking does not apply if you have set the <cmdname>catalogd</cmdname> configuration option
+ <codeph>--load_catalog_in_background=false</codeph>.) Impala reports any lack of write permissions as an
+ <codeph>INFO</codeph> message in the log file, in case that represents an oversight. If you change HDFS
+ permissions to make data readable or writeable by the Impala user, issue another <codeph>INVALIDATE
+ METADATA</codeph> to make Impala aware of the change.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p rev="1.2.4">
+ This example illustrates creating a new database and new table in Hive, then doing an <codeph>INVALIDATE
+ METADATA</codeph> statement in Impala using the fully qualified table name, after which both the new table
+ and the new database are visible to Impala. The ability to specify <codeph>INVALIDATE METADATA
+ <varname>table_name</varname></codeph> for a table created in Hive is a new capability in Impala 1.2.4. In
+ earlier releases, that statement would have returned an error indicating an unknown table, requiring you to
+ do <codeph>INVALIDATE METADATA</codeph> with no table name, a more expensive operation that reloaded metadata
+ for all tables and databases.
+ </p>
+
+<codeblock rev="1.2.4">$ hive
+hive> create database new_db_from_hive;
+OK
+Time taken: 4.118 seconds
+hive> create table new_db_from_hive.new_table_from_hive (x int);
+OK
+Time taken: 0.618 seconds
+hive> quit;
+$ impala-shell
+[localhost:21000] > show databases like 'new*';
+[localhost:21000] > refresh new_db_from_hive.new_table_from_hive;
+ERROR: AnalysisException: Database does not exist: new_db_from_hive
+[localhost:21000] > invalidate metadata new_db_from_hive.new_table_from_hive;
+[localhost:21000] > show databases like 'new*';
++--------------------+
+| name |
++--------------------+
+| new_db_from_hive |
++--------------------+
+[localhost:21000] > show tables in new_db_from_hive;
++---------------------+
+| name |
++---------------------+
+| new_table_from_hive |
++---------------------+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/s3_blurb"/>
+ <p conref="../shared/impala_common.xml#common/s3_metadata"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref href="impala_hadoop.xml#intro_metastore"/>,
+ <xref href="impala_refresh.xml#refresh"/>
+ </p>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_joins.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_joins.xml b/docs/topics/impala_joins.xml
new file mode 100644
index 0000000..011a488
--- /dev/null
+++ b/docs/topics/impala_joins.xml
@@ -0,0 +1,520 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="joins">
+
+ <title>Joins in Impala SELECT Statements</title>
+ <titlealts><navtitle>Joins</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">joins</indexterm>
+ A join query is a <codeph>SELECT</codeph> statement that combines data from two or more tables,
+ and returns a result set containing items from some or all of those tables. It is a way to
+ cross-reference and correlate related data that is organized into multiple tables, typically
+ using identifiers that are repeated in each of the joined tables.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/join_types"/>
+
+<codeblock>SELECT <varname>select_list</varname> FROM
+ <varname>table_or_subquery1</varname> [INNER] JOIN <varname>table_or_subquery2</varname> |
+ <varname>table_or_subquery1</varname> {LEFT [OUTER] | RIGHT [OUTER] | FULL [OUTER]} JOIN <varname>table_or_subquery2</varname> |
+ <varname>table_or_subquery1</varname> {LEFT | RIGHT} SEMI JOIN <varname>table_or_subquery2</varname> |
+ <ph rev="2.0.0"><varname>table_or_subquery1</varname> {LEFT | RIGHT} ANTI JOIN <varname>table_or_subquery2</varname> |</ph>
+ [ ON <varname>col1</varname> = <varname>col2</varname> [AND <varname>col3</varname> = <varname>col4</varname> ...] |
+ USING (<varname>col1</varname> [, <varname>col2</varname> ...]) ]
+ [<varname>other_join_clause</varname> ...]
+[ WHERE <varname>where_clauses</varname> ]
+
+SELECT <varname>select_list</varname> FROM
+ <varname>table_or_subquery1</varname>, <varname>table_or_subquery2</varname> [, <varname>table_or_subquery3</varname> ...]
+ [<varname>other_join_clause</varname> ...]
+WHERE
+ <varname>col1</varname> = <varname>col2</varname> [AND <varname>col3</varname> = <varname>col4</varname> ...]
+
+SELECT <varname>select_list</varname> FROM
+ <varname>table_or_subquery1</varname> CROSS JOIN <varname>table_or_subquery2</varname>
+ [<varname>other_join_clause</varname> ...]
+[ WHERE <varname>where_clauses</varname> ]</codeblock>
+
+ <p>
+ <b>SQL-92 and SQL-89 Joins:</b>
+ </p>
+
+ <p>
+ Queries with the explicit <codeph>JOIN</codeph> keywords are known as SQL-92 style joins, referring to the
+ level of the SQL standard where they were introduced. The corresponding <codeph>ON</codeph> or
+ <codeph>USING</codeph> clauses clearly show which columns are used as the join keys in each case:
+ </p>
+
+<codeblock>SELECT t1.c1, t2.c2 FROM <b>t1 JOIN t2</b>
+ <b>ON t1.id = t2.id and t1.type_flag = t2.type_flag</b>
+ WHERE t1.c1 > 100;
+
+SELECT t1.c1, t2.c2 FROM <b>t1 JOIN t2</b>
+ <b>USING (id, type_flag)</b>
+ WHERE t1.c1 > 100;</codeblock>
+
+ <p>
+ The <codeph>ON</codeph> clause is a general way to compare columns across the two tables, even if the column
+ names are different. The <codeph>USING</codeph> clause is a shorthand notation for specifying the join
+ columns, when the column names are the same in both tables. You can code equivalent <codeph>WHERE</codeph>
+ clauses that compare the columns, instead of <codeph>ON</codeph> or <codeph>USING</codeph> clauses, but that
+ practice is not recommended because mixing the join comparisons with other filtering clauses is typically
+ less readable and harder to maintain.
+ </p>
+
+ <p>
+ Queries with a comma-separated list of tables and subqueries are known as SQL-89 style joins. In these
+ queries, the equality comparisons between columns of the joined tables go in the <codeph>WHERE</codeph>
+ clause alongside other kinds of comparisons. This syntax is easy to learn, but it is also easy to
+ accidentally remove a <codeph>WHERE</codeph> clause needed for the join to work correctly.
+ </p>
+
+<codeblock>SELECT t1.c1, t2.c2 FROM <b>t1, t2</b>
+ WHERE
+ <b>t1.id = t2.id AND t1.type_flag = t2.type_flag</b>
+ AND t1.c1 > 100;</codeblock>
+
+ <p>
+ <b>Self-joins:</b>
+ </p>
+
+ <p>
+ Impala can do self-joins, for example to join on two different columns in the same table to represent
+ parent-child relationships or other tree-structured data. There is no explicit syntax for this; just use the
+ same table name for both the left-hand and right-hand table, and assign different table aliases to use when
+ referring to the fully qualified column names:
+ </p>
+
+<codeblock>-- Combine fields from both parent and child rows.
+SELECT lhs.id, rhs.parent, lhs.c1, rhs.c2 FROM tree_data lhs, tree_data rhs WHERE lhs.id = rhs.parent;</codeblock>
+
+ <p>
+ <b>Cartesian joins:</b>
+ </p>
+
+ <p>
+ To avoid producing huge result sets by mistake, Impala does not allow Cartesian joins of the form:
+<codeblock>SELECT ... FROM t1 JOIN t2;
+SELECT ... FROM t1, t2;</codeblock>
+ If you intend to join the tables based on common values, add <codeph>ON</codeph> or <codeph>WHERE</codeph>
+ clauses to compare columns across the tables. If you truly intend to do a Cartesian join, use the
+ <codeph>CROSS JOIN</codeph> keyword as the join operator. The <codeph>CROSS JOIN</codeph> form does not use
+ any <codeph>ON</codeph> clause, because it produces a result set with all combinations of rows from the
+ left-hand and right-hand tables. The result set can still be filtered by subsequent <codeph>WHERE</codeph>
+ clauses. For example:
+ </p>
+
+<codeblock>SELECT ... FROM t1 CROSS JOIN t2;
+SELECT ... FROM t1 CROSS JOIN t2 WHERE <varname>tests_on_non_join_columns</varname>;</codeblock>
+
+ <p>
+ <b>Inner and outer joins:</b>
+ </p>
+
+ <p>
+ An inner join is the most common and familiar type: rows in the result set contain the requested columns from
+ the appropriate tables, for all combinations of rows where the join columns of the tables have identical
+ values. If a column with the same name occurs in both tables, use a fully qualified name or a column alias to
+ refer to the column in the select list or other clauses. Impala performs inner joins by default for both
+ SQL-89 and SQL-92 join syntax:
+ </p>
+
+<codeblock>-- The following 3 forms are all equivalent.
+SELECT t1.id, c1, c2 FROM t1, t2 WHERE t1.id = t2.id;
+SELECT t1.id, c1, c2 FROM t1 JOIN t2 ON t1.id = t2.id;
+SELECT t1.id, c1, c2 FROM t1 INNER JOIN t2 ON t1.id = t2.id;</codeblock>
+
+ <p>
+ An outer join retrieves all rows from the left-hand table, or the right-hand table, or both; wherever there
+ is no matching data in the table on the other side of the join, the corresponding columns in the result set
+ are set to <codeph>NULL</codeph>. To perform an outer join, include the <codeph>OUTER</codeph> keyword in the
+ join operator, along with either <codeph>LEFT</codeph>, <codeph>RIGHT</codeph>, or <codeph>FULL</codeph>:
+ </p>
+
+<codeblock>SELECT * FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.id;
+SELECT * FROM t1 RIGHT OUTER JOIN t2 ON t1.id = t2.id;
+SELECT * FROM t1 FULL OUTER JOIN t2 ON t1.id = t2.id;</codeblock>
+
+ <p>
+ For outer joins, Impala requires SQL-92 syntax; that is, the <codeph>JOIN</codeph> keyword instead of
+ comma-separated table names. Impala does not support vendor extensions such as <codeph>(+)</codeph> or
+ <codeph>*=</codeph> notation for doing outer joins with SQL-89 query syntax.
+ </p>
+
+ <p>
+ <b>Equijoins and Non-Equijoins:</b>
+ </p>
+
+ <p>
+ By default, Impala requires an equality comparison between the left-hand and right-hand tables, either
+ through <codeph>ON</codeph>, <codeph>USING</codeph>, or <codeph>WHERE</codeph> clauses. These types of
+ queries are classified broadly as equijoins. Inner, outer, full, and semi joins can all be equijoins based on
+ the presence of equality tests between columns in the left-hand and right-hand tables.
+ </p>
+
+ <p>
+ In Impala 1.2.2 and higher, non-equijoin queries are also possible, with comparisons such as
+ <codeph>!=</codeph> or <codeph>&lt;</codeph> between the join columns. These kinds of queries require care to
+ avoid producing huge result sets that could exceed resource limits. Once you have planned a non-equijoin
+ query that produces a result set of acceptable size, you can code the query using the <codeph>CROSS
+ JOIN</codeph> operator, and add the extra comparisons in the <codeph>WHERE</codeph> clause:
+ </p>
+
+<codeblock>SELECT * FROM t1 CROSS JOIN t2 WHERE t1.total > t2.maximum_price;</codeblock>
+
+ <p rev="2.3.0">
+ In CDH 5.5 / Impala 2.3 and higher, additional non-equijoin queries are possible due to the addition
+ of nested loop joins. These queries typically involve <codeph>SEMI JOIN</codeph>,
+ <codeph>ANTI JOIN</codeph>, or <codeph>FULL OUTER JOIN</codeph> clauses.
+ Impala sometimes also uses nested loop joins internally when evaluating <codeph>OUTER JOIN</codeph>
+ queries involving complex type columns.
+ Query phases involving nested loop joins do not use the spill-to-disk mechanism if they
+ exceed the memory limit. Impala decides internally when to use each join mechanism; you cannot
+ specify any query hint to choose between the nested loop join or the original hash join algorithm.
+ </p>
+
+<codeblock rev="2.3.0">SELECT * FROM t1 LEFT OUTER JOIN t2 ON t1.int_col &lt; t2.int_col;</codeblock>
+
+ <p>
+ <b>Semi-joins:</b>
+ </p>
+
+ <p>
+ Semi-joins are a relatively rarely used variation. With the left semi-join, only data from the left-hand
+ table is returned, for rows where there is matching data in the right-hand table, based on comparisons
+ between join columns in <codeph>ON</codeph> or <codeph>WHERE</codeph> clauses. Only one instance of each row
+ from the left-hand table is returned, regardless of how many matching rows exist in the right-hand table.
+ <ph rev="2.0.0">A right semi-join (available in Impala 2.0 and higher) reverses the comparison and returns
+ data from the right-hand table.</ph>
+ </p>
+
+<codeblock>SELECT t1.c1, t1.c2 FROM t1 LEFT SEMI JOIN t2 ON t1.id = t2.id;</codeblock>
+
+ <p>
+ <b>Natural joins (not supported):</b>
+ </p>
+
+ <p>
+ Impala does not support the <codeph>NATURAL JOIN</codeph> operator, again to avoid inconsistent or huge
+ result sets. Natural joins do away with the <codeph>ON</codeph> and <codeph>USING</codeph> clauses, and
+ instead automatically join on all columns with the same names in the left-hand and right-hand tables. This
+ kind of query is not recommended for rapidly evolving data structures such as are typically used in Hadoop.
+ Thus, Impala does not support the <codeph>NATURAL JOIN</codeph> syntax, which can produce different query
+ results as columns are added to or removed from tables.
+ </p>
+
+ <p>
+ If you do have any queries that use <codeph>NATURAL JOIN</codeph>, make sure to rewrite them with explicit
+ <codeph>USING</codeph> clauses, because Impala could interpret the <codeph>NATURAL</codeph> keyword as a
+ table alias:
+ </p>
+
+<codeblock>-- 'NATURAL' is interpreted as an alias for 't1' and Impala attempts an inner join,
+-- resulting in an error because inner joins require explicit comparisons between columns.
+SELECT t1.c1, t2.c2 FROM t1 NATURAL JOIN t2;
+ERROR: NotImplementedException: Join with 't2' requires at least one conjunctive equality predicate.
+ To perform a Cartesian product between two tables, use a CROSS JOIN.
+
+-- If you expect the tables to have identically named columns with matching values,
+-- list the corresponding column names in a USING clause.
+SELECT t1.c1, t2.c2 FROM t1 JOIN t2 USING (id, type_flag, name, address);</codeblock>
+
+ <p rev="2.0.0">
+ <b>Anti-joins (Impala 2.0 / CDH 5.2 and higher only):</b>
+ </p>
+
+ <p rev="2.0.0">
+ Impala supports the <codeph>LEFT ANTI JOIN</codeph> and <codeph>RIGHT ANTI JOIN</codeph> clauses in Impala
+ 2.0 and higher on CDH 4, or CDH 5.2 and higher on CDH 5. The <codeph>LEFT</codeph> or <codeph>RIGHT</codeph>
+ keyword is required for this kind of join. For <codeph>LEFT ANTI JOIN</codeph>, this clause returns those
+ values from the left-hand table that have no matching value in the right-hand table. <codeph>RIGHT ANTI
+ JOIN</codeph> reverses the comparison and returns values from the right-hand table. You can express this
+ negative relationship either through the <codeph>ANTI JOIN</codeph> clause or through a <codeph>NOT
+ EXISTS</codeph> operator with a subquery.
+ </p>
+
+<!-- Restriction lifted in Impala 2.0.
+<p>
+Impala does not support <codeph>WHERE</codeph> clauses
+such as <codeph>IN (<varname>subquery</varname>)</codeph>,
+<codeph>NOT IN (<varname>subquery</varname>)</codeph>,
+<codeph>EXISTS (<varname>subquery</varname>)</codeph>,
+and <codeph>NOT EXISTS (<varname>subquery</varname>)</codeph>.
+Therefore from a practical standpoint, you cannot
+express an anti-join condition, where values from one table
+are returned only if no matching values are present in another table.
+</p>
+-->
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+<!-- To do: reuse some complex types examples with joins here or under Examples farther down. -->
+
+ <p rev="2.3.0">
+ When referring to a column with a complex type (<codeph>STRUCT</codeph>, <codeph>ARRAY</codeph>, or <codeph>MAP</codeph>)
+ in a query, you use join notation to <q>unpack</q> the scalar fields of the struct, the elements of the array, or
+ the key-value pairs of the map. (The join notation is not required for aggregation operations, such as
+ <codeph>COUNT()</codeph> or <codeph>SUM()</codeph> for array elements.) Because Impala recognizes which complex type elements are associated with which row
+ of the result set, you use the same syntax as for a cross or Cartesian join, without an explicit join condition.
+ See <xref href="impala_complex_types.xml#complex_types"/> for details about Impala support for complex types.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ You typically use join queries in situations like these:
+ </p>
+
+ <ul>
+ <li>
+ When related data arrives from different sources, with each data set physically residing in a separate
+ table. For example, you might have address data from business records that you cross-check against phone
+ listings or census data.
+ <note>
+ Impala can join tables of different file formats, including Impala-managed tables and HBase tables. For
+ example, you might keep small dimension tables in HBase, for convenience of single-row lookups and
+ updates, and for the larger fact tables use Parquet or other binary file format optimized for scan
+ operations. Then, you can issue a join query to cross-reference the fact tables with the dimension
+ tables.
+ </note>
+ </li>
+
+ <li>
+ When data is normalized, a technique for reducing data duplication by dividing it across multiple tables.
+ This kind of organization is often found in data that comes from traditional relational database systems.
+ For example, instead of repeating some long string such as a customer name in multiple tables, each table
+ might contain a numeric customer ID. Queries that need to display the customer name could <q>join</q> the
+ table that specifies which customer ID corresponds to which name.
+ </li>
+
+ <li>
+ When certain columns are rarely needed for queries, so they are moved into separate tables to reduce
+ overhead for common queries. For example, a <codeph>biography</codeph> field might be rarely needed in
+ queries on employee data. Putting that field in a separate table reduces the amount of I/O for common
+ queries on employee addresses or phone numbers. Queries that do need the <codeph>biography</codeph> column
+ can retrieve it by performing a join with that separate table.
+ </li>
+
+ <li>
+ In CDH 5.5 / Impala 2.3 or higher, when referring to complex type columns in queries.
+ See <xref href="impala_complex_types.xml#complex_types"/> for details.
+ </li>
+ </ul>
+
+ <p>
+ When comparing columns with the same names in <codeph>ON</codeph> or <codeph>WHERE</codeph> clauses, use the
+ fully qualified names such as <codeph><varname>db_name</varname>.<varname>table_name</varname></codeph>, or
+ assign table aliases, column aliases, or both to make the code more compact and understandable:
+ </p>
+
+<codeblock>select t1.c1 as first_id, t2.c2 as second_id from
+ t1 join t2 on first_id = second_id;
+
+select fact.custno, dimension.custno from
+ customer_data as fact join customer_address as dimension
+ using (custno)</codeblock>
+
+ <note>
+ <p>
+ Performance for join queries is a crucial aspect for Impala, because complex join queries are
+ resource-intensive operations. An efficient join query produces much less network traffic and CPU overhead
+ than an inefficient one. For best results:
+ </p>
+ <ul>
+ <li rev="1.2">
+ Make sure that both <xref href="impala_perf_stats.xml#perf_stats">table and column statistics</xref> are
+ available for all the tables involved in a join query, and especially for the columns referenced in any
+ join conditions. Impala uses the statistics to automatically deduce an efficient join order.
+ Use <xref href="impala_show.xml#show"><codeph>SHOW TABLE STATS <varname>table_name</varname></codeph> and
+ <codeph>SHOW COLUMN STATS <varname>table_name</varname></codeph></xref> to check if statistics are
+ already present. Issue the <codeph>COMPUTE STATS <varname>table_name</varname></codeph> for a nonpartitioned table,
+ or (in Impala 2.1.0 and higher) <codeph>COMPUTE INCREMENTAL STATS <varname>table_name</varname></codeph>
+ for a partitioned table, to collect the initial statistics at both the table and column levels, and to keep the
+ statistics up to date after any substantial <codeph>INSERT</codeph> or <codeph>LOAD DATA</codeph> operations.
+ </li>
+
+ <li rev="1.2">
+ If table or column statistics are not available, join the largest table first. You can check the
+ existence of statistics with the <codeph>SHOW TABLE STATS <varname>table_name</varname></codeph> and
+ <codeph>SHOW COLUMN STATS <varname>table_name</varname></codeph> statements.
+ </li>
+
+ <li rev="1.2.2">
+ If table or column statistics are not available, join subsequent tables according to which table has the
+ most selective filter, based on overall size and <codeph>WHERE</codeph> clauses. Joining the table with
+ the most selective filter results in the fewest number of rows being returned.
+ </li>
+ </ul>
+ <p>
+ For more information and examples of performance for join queries, see
+ <xref href="impala_perf_joins.xml#perf_joins"/>.
+ </p>
+ </note>
+
+ <p>
+ To control the result set from a join query, include the names of corresponding column names in both tables
+ in an <codeph>ON</codeph> or <codeph>USING</codeph> clause, or by coding equality comparisons for those
+ columns in the <codeph>WHERE</codeph> clause.
+ </p>
+
+<codeblock>[localhost:21000] > select c_last_name, ca_city from customer join customer_address where c_customer_sk = ca_address_sk;
++-------------+-----------------+
+| c_last_name | ca_city |
++-------------+-----------------+
+| Lewis | Fairfield |
+| Moses | Fairview |
+| Hamilton | Pleasant Valley |
+| White | Oak Ridge |
+| Moran | Glendale |
+...
+| Richards | Lakewood |
+| Day | Lebanon |
+| Painter | Oak Hill |
+| Bentley | Greenfield |
+| Jones | Stringtown |
++-------------+-----------------+
+Returned 50000 row(s) in 9.82s</codeblock>
+
+ <p>
+ One potential downside of joins is the possibility of excess resource usage in poorly constructed queries.
+ Impala imposes restrictions on join queries to guard against such issues. To minimize the chance of runaway
+ queries on large data sets, Impala requires every join query to contain at least one equality predicate
+ between the columns of the various tables. For example, if <codeph>T1</codeph> contains 1000 rows and
+ <codeph>T2</codeph> contains 1,000,000 rows, a query <codeph>SELECT <varname>columns</varname> FROM t1 JOIN
+ t2</codeph> could return up to 1 billion rows (1000 * 1,000,000); Impala requires that the query include a
+ clause such as <codeph>ON t1.c1 = t2.c2</codeph> or <codeph>WHERE t1.c1 = t2.c2</codeph>.
+ </p>
+
+ <p>
+ Because even with equality clauses, the result set can still be large, as we saw in the previous example, you
+ might use a <codeph>LIMIT</codeph> clause to return a subset of the results:
+ </p>
+
+<codeblock>[localhost:21000] > select c_last_name, ca_city from customer, customer_address where c_customer_sk = ca_address_sk limit 10;
++-------------+-----------------+
+| c_last_name | ca_city |
++-------------+-----------------+
+| Lewis | Fairfield |
+| Moses | Fairview |
+| Hamilton | Pleasant Valley |
+| White | Oak Ridge |
+| Moran | Glendale |
+| Sharp | Lakeview |
+| Wiles | Farmington |
+| Shipman | Union |
+| Gilbert | New Hope |
+| Brunson | Martinsville |
++-------------+-----------------+
+Returned 10 row(s) in 0.63s</codeblock>
+
+ <p>
+ Or you might use additional comparison operators or aggregation functions to condense a large result set into
+ a smaller set of values:
+ </p>
+
+<codeblock>[localhost:21000] > -- Find the names of customers who live in one particular town.
+[localhost:21000] > select distinct c_last_name from customer, customer_address where
+ c_customer_sk = ca_address_sk
+ and ca_city = "Green Acres";
++---------------+
+| c_last_name |
++---------------+
+| Hensley |
+| Pearson |
+| Mayer |
+| Montgomery |
+| Ricks |
+...
+| Barrett |
+| Price |
+| Hill |
+| Hansen |
+| Meeks |
++---------------+
+Returned 332 row(s) in 0.97s
+
+[localhost:21000] > -- See how many different customers in this town have names starting with "A".
+[localhost:21000] > select count(distinct c_last_name) from customer, customer_address where
+ c_customer_sk = ca_address_sk
+ and ca_city = "Green Acres"
+ and substr(c_last_name,1,1) = "A";
++-----------------------------+
+| count(distinct c_last_name) |
++-----------------------------+
+| 12 |
++-----------------------------+
+Returned 1 row(s) in 1.00s</codeblock>
+
+ <p>
+ Because a join query can involve reading large amounts of data from disk, sending large amounts of data
+ across the network, and loading large amounts of data into memory to do the comparisons and filtering, you
+ might do benchmarking, performance analysis, and query tuning to find the most efficient join queries for
+ your data set, hardware capacity, network configuration, and cluster workload.
+ </p>
+
+ <p>
+ The two categories of joins in Impala are known as <b>partitioned joins</b> and <b>broadcast joins</b>. If
+ inaccurate table or column statistics, or some quirk of the data distribution, causes Impala to choose the
+ wrong mechanism for a particular join, consider using query hints as a temporary workaround. For details, see
+ <xref href="impala_hints.xml#hints"/>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following examples refer to these simple tables containing small sets of integers:
+<codeblock>[localhost:21000] > create table t1 (x int);
+[localhost:21000] > insert into t1 values (1), (2), (3), (4), (5), (6);
+
+[localhost:21000] > create table t2 (y int);
+[localhost:21000] > insert into t2 values (2), (4), (6);
+
+[localhost:21000] > create table t3 (z int);
+[localhost:21000] > insert into t3 values (1), (3), (5);
+</codeblock>
+ </p>
+
+<!-- To do: fill in examples for other join types. -->
+
+ <p>
+ The following example demonstrates an anti-join, returning the values from <codeph>T1</codeph> that do not
+ exist in <codeph>T2</codeph> (in this case, the odd numbers 1, 3, and 5):
+ </p>
+
+<codeblock>[localhost:21000] > select x from t1 left anti join t2 on (t1.x = t2.y);
++---+
+| x |
++---+
+| 1 |
+| 3 |
+| 5 |
++---+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ See these tutorials for examples of different kinds of joins:
+ </p>
+
+ <ul>
+ <li>
+ <xref href="impala_tutorial.xml#tut_cross_join"/>
+ </li>
+ </ul>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_langref.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_langref.xml b/docs/topics/impala_langref.xml
new file mode 100644
index 0000000..aaa76aa
--- /dev/null
+++ b/docs/topics/impala_langref.xml
@@ -0,0 +1,179 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="langref">
+
+ <title><ph audience="PDF">Impala SQL Language Reference</ph><ph audience="HTML">Overview of Impala SQL</ph></title>
+
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="impala-shell"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Impala uses SQL as its query language. Impala interprets SQL statements and performs the
+ full end-to-end processing for each statement. (As opposed to acting as a translation
+ layer for some other Hadoop subsystem.)
+ </p>
+
+ <p>
+ Impala implements many familiar statements, such as <codeph>CREATE TABLE</codeph>,
+ <codeph>INSERT</codeph>, and <codeph>SELECT</codeph>. Currently, the DML statements
+ <codeph>UPDATE</codeph> and <codeph>DELETE</codeph> are not available in the production
+ level of Impala, because big data analytics with Hadoop and HDFS typically involves
+ unchanging data. <codeph>UPDATE</codeph> and <codeph>DELETE</codeph> <i>are</i> available
+ in beta form in the version of Impala used with the Kudu storage layer. For full details
+ about Impala SQL syntax and semantics, see
+ <xref href="impala_langref_sql.xml#langref_sql"/>.
+ </p>
+
+ <p>
+ Queries include clauses such as <codeph>WHERE</codeph>, <codeph>GROUP BY</codeph>,
+ <codeph>ORDER BY</codeph>, and <codeph>JOIN</codeph>. For information about query syntax,
+ see <xref href="impala_select.xml#select"/>.
+ </p>
+
+ <p>
+ Queries can also include function calls, to scalar functions such as
+ <codeph>sin()</codeph> and <codeph>substr()</codeph>, aggregate functions such as
+ <codeph>count()</codeph> and <codeph>avg()</codeph>, and analytic functions such as
+ <codeph>lag()</codeph> and <codeph>rank()</codeph>. For a list of the built-in functions
+ available in Impala queries, see <xref href="impala_functions.xml#builtins"/>.
+ </p>
+
+ <p outputclass="toc"/>
+
+ </conbody>
+
+ <concept id="langref_performance">
+
+ <title>Performance Features</title>
+
+ <conbody>
+
+ <p>
+ The main performance-related SQL features for Impala are:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ The <codeph>COMPUTE STATS</codeph> statement, and the underlying table statistics
+ and column statistics used in query planning. The statistics are used to estimate
+ the number of rows and size of the result set for queries, subqueries, and the
+ different <q>sides</q> of a join query.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ The output of the <codeph>EXPLAIN</codeph> statement. It outlines the ways in which
+ the query is parallelized, and how much I/O, memory, and so on the query expects to
+ use. You can control the level of detail in the output through a query option.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Partitioning for tables. By organizing the data for efficient access along one or
+ more dimensions, this technique lets queries read only the relevant data.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Query hints, especially for join queries. Impala selects from different join
+ algorithms based on the relative sizes of the result sets for each side of the join.
+ In cases where you know the most effective technique for a particular query, you can
+ override the estimates that Impala uses to make that choice, and select the join
+ technique directly.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Query options. These options control settings that can influence the performance of
+ individual queries when you know the special considerations based on your workload,
+ hardware configuration, or data distribution.
+ </p>
+ </li>
+ </ul>
+
+ <p>
+ Because analytic queries against high volumes of data tend to require full scans against
+ large portions of data from each table, Impala does not include index-related SQL
+ statements such as <codeph>CREATE INDEX</codeph>. The <codeph>COMPUTE STATS</codeph>
+ statement serves the purpose of analyzing the distribution of data within each column and the
+ overall table. Partitioning optimizes the physical layout of the data for queries that
+ filter on one or more crucial columns.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="hive_interoperability">
+
+ <title>Sharing Tables, Data, and Queries Between Impala and Hive</title>
+
+ <conbody>
+
+ <p>
+ To protect user investment in skills development and query design, Impala provides a
+ high degree of compatibility with the Hive Query Language (HiveQL):
+ </p>
+
+ <ul>
+ <li>
+ Because Impala uses the same metadata store as Hive to record information about table
+ structure and properties, Impala can access tables defined through the native Impala
+ <codeph>CREATE TABLE</codeph> command, or tables created using the Hive data
+ definition language (DDL).
+ </li>
+
+ <li>
+ Impala supports data manipulation (DML) statements similar to the DML component of
+ HiveQL.
+ </li>
+
+ <li>
+ Impala provides many <xref href="impala_functions.xml#builtins">built-in
+ functions</xref> with the same names and parameter types as their HiveQL equivalents.
+ </li>
+ </ul>
+
+ <p>
+ Impala supports most of the same
+ <xref href="impala_langref_sql.xml#langref_sql">statements and clauses</xref> as HiveQL,
+ including, but not limited to, <codeph>JOIN</codeph>, <codeph>AGGREGATE</codeph>,
+ <codeph>DISTINCT</codeph>, <codeph>UNION ALL</codeph>, <codeph>ORDER BY</codeph>,
+ <codeph>LIMIT</codeph>, and (uncorrelated) subquery in the <codeph>FROM</codeph> clause.
+ Impala also supports <codeph>INSERT INTO</codeph> and <codeph>INSERT OVERWRITE</codeph>.
+ </p>
+
+ <p>
+ Impala supports data types with the same names and semantics as the equivalent Hive data
+ types: <codeph>STRING</codeph>, <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>,
+ <codeph>INT</codeph>, <codeph>BIGINT</codeph>, <codeph>FLOAT</codeph>,
+ <codeph>DOUBLE</codeph>, <codeph>BOOLEAN</codeph>, and
+ <codeph>TIMESTAMP</codeph>. CDH 5.5 / Impala 2.3 and higher also include the complex
+ types <codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, and <codeph>MAP</codeph>.
+ </p>
+
+ <p>
+ Most HiveQL <codeph>SELECT</codeph> and <codeph>INSERT</codeph> statements run
+ unmodified with Impala. For information about Hive syntax not available in Impala, see
+ <xref href="impala_langref_unsupported.xml#langref_hiveql_delta"/>.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_langref_sql.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_langref_sql.xml b/docs/topics/impala_langref_sql.xml
new file mode 100644
index 0000000..d759e76
--- /dev/null
+++ b/docs/topics/impala_langref_sql.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="langref_sql">
+
+ <title>Impala SQL Statements</title>
+ <titlealts><navtitle>SQL Statements</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Data Analysts"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ The Impala SQL dialect supports a range of standard elements, plus some extensions for Big Data use cases
+ related to data loading and data warehousing.
+ </p>
+
+ <note>
+ <p>
+ In the <cmdname>impala-shell</cmdname> interpreter, a semicolon at the end of each statement is required.
+ Since the semicolon is not actually part of the SQL syntax, we do not include it in the syntax definition
+ of each statement, but we do show it in examples intended to be run in <cmdname>impala-shell</cmdname>.
+ </p>
+ </note>
+
+ <p audience="PDF" outputclass="toc all">
+ The following sections show the major SQL statements that you work with in Impala:
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_langref_unsupported.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_langref_unsupported.xml b/docs/topics/impala_langref_unsupported.xml
new file mode 100644
index 0000000..f2b0560
--- /dev/null
+++ b/docs/topics/impala_langref_unsupported.xml
@@ -0,0 +1,296 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="langref_hiveql_delta">
+
+ <title>SQL Differences Between Impala and Hive</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Hive"/>
+ <data name="Category" value="Porting"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">Hive</indexterm>
+ <indexterm audience="Cloudera">HiveQL</indexterm>
+ Impala's SQL syntax follows the SQL-92 standard, and includes many industry extensions in areas such as
+ built-in functions. See <xref href="impala_porting.xml#porting"/> for a general discussion of adapting SQL
+ code from a variety of database systems to Impala.
+ </p>
+
+ <p>
+ Because Impala and Hive share the same metastore database and their tables are often used interchangeably,
+ the following section covers differences between Impala and Hive in detail.
+ </p>
+
+ <p outputclass="toc inpage"/>
+ </conbody>
+
+ <concept id="langref_hiveql_unsupported">
+
+ <title>HiveQL Features not Available in Impala</title>
+
+ <conbody>
+
+ <p>
+ The current release of Impala does not support the following SQL features that you might be familiar with
+ from HiveQL:
+ </p>
+
+ <draft-comment translate="no">
+Yeesh, too many separate lists of unsupported Hive syntax.
+Here, the FAQ, and in some of the intro topics.
+Some discussion in IMP-1061 about how best to reorg.
+Lots of opportunities for conrefs.
+</draft-comment>
+
+ <ul>
+<!-- Now supported in CDH 5.5 / Impala 2.3 and higher. Find places on this page (like already done under lateral views) to note the new data type support.
+ <li>
+ Non-scalar data types such as maps, arrays, structs.
+ </li>
+-->
+
+ <li rev="1.2">
+ Extensibility mechanisms such as <codeph>TRANSFORM</codeph>, custom file formats, or custom SerDes.
+ </li>
+
+ <li>
+ XML and JSON functions.
+ </li>
+
+ <li>
+ Certain aggregate functions from HiveQL: <codeph>covar_pop</codeph>, <codeph>covar_samp</codeph>,
+ <codeph>corr</codeph>, <codeph>percentile</codeph>, <codeph>percentile_approx</codeph>,
+ <codeph>histogram_numeric</codeph>, <codeph>collect_set</codeph>; Impala supports the set of aggregate
+ functions listed in <xref href="impala_aggregate_functions.xml#aggregate_functions"/> and analytic
+ functions listed in <xref href="impala_analytic_functions.xml#analytic_functions"/>.
+ </li>
+
+ <li>
+ Sampling.
+ </li>
+
+ <li>
+ Lateral views. In CDH 5.5 / Impala 2.3 and higher, Impala supports queries on complex types
+ (<codeph>STRUCT</codeph>, <codeph>ARRAY</codeph>, or <codeph>MAP</codeph>), using join notation
+ rather than the <codeph>EXPLODE()</codeph> keyword.
+ See <xref href="impala_complex_types.xml#complex_types"/> for details about Impala support for complex types.
+ </li>
+
+ <li>
+ Multiple <codeph>DISTINCT</codeph> clauses per query, although Impala includes some workarounds for this
+ limitation.
+ <note conref="../shared/impala_common.xml#common/multiple_count_distinct"/>
+ </li>
+ </ul>
+
+ <p>
+ User-defined functions (UDFs) are supported starting in Impala 1.2. See <xref href="impala_udf.xml#udfs"/>
+ for full details on Impala UDFs.
+ <ul>
+ <li>
+ Impala supports high-performance UDFs written in C++, as well as reusing some Java-based Hive UDFs.
+ </li>
+
+ <li>
+ Impala supports scalar UDFs and user-defined aggregate functions (UDAFs). Impala does not currently
+ support user-defined table generating functions (UDTFs).
+ </li>
+
+ <li>
+ Only Impala-supported column types are supported in Java-based UDFs.
+ </li>
+ </ul>
+ </p>
+
+ <p>
+ Impala does not currently support these HiveQL statements:
+ </p>
+
+ <ul>
+ <li>
+ <codeph>ANALYZE TABLE</codeph> (the Impala equivalent is <codeph>COMPUTE STATS</codeph>)
+ </li>
+
+ <li>
+ <codeph>DESCRIBE COLUMN</codeph>
+ </li>
+
+ <li>
+ <codeph>DESCRIBE DATABASE</codeph>
+ </li>
+
+ <li>
+ <codeph>EXPORT TABLE</codeph>
+ </li>
+
+ <li>
+ <codeph>IMPORT TABLE</codeph>
+ </li>
+
+ <li>
+ <codeph>SHOW TABLE EXTENDED</codeph>
+ </li>
+
+ <li>
+ <codeph>SHOW INDEXES</codeph>
+ </li>
+
+ <li>
+ <codeph>SHOW COLUMNS</codeph>
+ </li>
+ </ul>
+ </conbody>
+ </concept>
+
+ <concept id="langref_hiveql_semantics">
+
+ <title>Semantic Differences Between Impala and HiveQL Features</title>
+
+ <conbody>
+
+ <p>
+ This section covers instances where Impala and Hive have similar functionality, sometimes including the
+ same syntax, but there are differences in the runtime semantics of those features.
+ </p>
+
+ <p>
+ <b>Security:</b>
+ </p>
+
+ <p>
+ Impala utilizes the <xref href="http://sentry.incubator.apache.org/" scope="external" format="html">Apache
+ Sentry (incubating)</xref> authorization framework, which provides fine-grained role-based access control
+ to protect data against unauthorized access or tampering.
+ </p>
+
+ <p>
+ The Hive component included in CDH 5.1 and higher now includes Sentry-enabled <codeph>GRANT</codeph>,
+ <codeph>REVOKE</codeph>, and <codeph>CREATE/DROP ROLE</codeph> statements. Earlier Hive releases had a
+ privilege system with <codeph>GRANT</codeph> and <codeph>REVOKE</codeph> statements that were primarily
+ intended to prevent accidental deletion of data, rather than a security mechanism to protect against
+ malicious users.
+ </p>
+
+ <p>
+ Impala can make use of privileges set up through Hive <codeph>GRANT</codeph> and <codeph>REVOKE</codeph> statements.
+ Impala has its own <codeph>GRANT</codeph> and <codeph>REVOKE</codeph> statements in Impala 2.0 and higher.
+ See <xref href="impala_authorization.xml#authorization"/> for the details of authorization in Impala, including
+ how to switch from the original policy file-based privilege model to the Sentry service using privileges
+ stored in the metastore database.
+ </p>
+
+ <p>
+ <b>SQL statements and clauses:</b>
+ </p>
+
+ <p>
+ The semantics of Impala SQL statements vary from HiveQL in some cases where they use similar SQL
+ statement and clause names:
+ </p>
+
+ <ul>
+ <li>
+ Impala uses different syntax and names for query hints, <codeph>[SHUFFLE]</codeph> and
+ <codeph>[NOSHUFFLE]</codeph> rather than <codeph>MapJoin</codeph> or <codeph>StreamJoin</codeph>. See
+ <xref href="impala_joins.xml#joins"/> for the Impala details.
+ </li>
+
+ <li>
+ Impala does not expose MapReduce-specific features of <codeph>SORT BY</codeph>, <codeph>DISTRIBUTE
+ BY</codeph>, or <codeph>CLUSTER BY</codeph>.
+ </li>
+
+ <li>
+ Impala does not require queries to include a <codeph>FROM</codeph> clause.
+ </li>
+ </ul>
+
+ <p>
+ <b>Data types:</b>
+ </p>
+
+ <ul>
+ <li>
+ Impala supports a limited set of implicit casts. This can help avoid undesired results from unexpected
+ casting behavior.
+ <ul>
+ <li>
+ Impala does not implicitly cast between string and numeric or Boolean types. Always use
+ <codeph>CAST()</codeph> for these conversions.
+ </li>
+
+ <li>
+ Impala does perform implicit casts among the numeric types, when going from a smaller or less precise
+ type to a larger or more precise one. For example, Impala will implicitly convert a
+ <codeph>SMALLINT</codeph> to a <codeph>BIGINT</codeph> or <codeph>FLOAT</codeph>, but to convert from
+ <codeph>DOUBLE</codeph> to <codeph>FLOAT</codeph> or <codeph>INT</codeph> to <codeph>TINYINT</codeph>
+ requires a call to <codeph>CAST()</codeph> in the query.
+ </li>
+
+ <li>
+ Impala does perform implicit casts from string to timestamp. Impala has a restricted set of literal
+ formats for the <codeph>TIMESTAMP</codeph> data type and the <codeph>from_unixtime()</codeph> format
+ string; see <xref href="impala_timestamp.xml#timestamp"/> for details.
+ </li>
+ </ul>
+ <p>
+ See <xref href="impala_datatypes.xml#datatypes"/> for full details on implicit and explicit casting for
+ all types, and <xref href="impala_conversion_functions.xml#conversion_functions"/> for details about
+ the <codeph>CAST()</codeph> function.
+ </p>
+ </li>
+
+ <li>
+ Impala does not store or interpret timestamps using the local timezone, to avoid undesired results from
+ unexpected time zone issues. Timestamps are stored and interpreted relative to UTC. This difference can
+ produce different results for some calls to similarly named date/time functions between Impala and Hive.
+ See <xref href="impala_datetime_functions.xml#datetime_functions"/> for details about the Impala
+ functions. See <xref href="impala_timestamp.xml#timestamp"/> for a discussion of how Impala handles
+ time zones, and configuration options you can use to make Impala match the Hive behavior more closely
+ when dealing with Parquet-encoded <codeph>TIMESTAMP</codeph> data or when converting between
+ the local time zone and UTC.
+ </li>
+
+ <li>
+ The Impala <codeph>TIMESTAMP</codeph> type can represent dates ranging from 1400-01-01 to 9999-12-31.
+ This is different from the Hive date range, which is 0000-01-01 to 9999-12-31.
+ </li>
+
+ <li>
+ Impala does not return column overflows as <codeph>NULL</codeph>, so that customers can distinguish
+ between <codeph>NULL</codeph> data and overflow conditions similar to how they do so with traditional
+ database systems. Impala returns the largest or smallest value in the range for the type. For example,
+ valid values for a <codeph>tinyint</codeph> range from -128 to 127. In Impala, a <codeph>tinyint</codeph>
+ with a value of -200 returns -128 rather than <codeph>NULL</codeph>. A <codeph>tinyint</codeph> with a
+ value of 200 returns 127.
+ </li>
+ </ul>
+
+ <p>
+ <b>Miscellaneous features:</b>
+ </p>
+
+ <ul>
+ <li>
+ Impala does not provide virtual columns.
+ </li>
+
+ <li>
+ Impala does not expose locking.
+ </li>
+
+ <li>
+ Impala does not expose some configuration properties.
+ </li>
+ </ul>
+ </conbody>
+ </concept>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_limit.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_limit.xml b/docs/topics/impala_limit.xml
new file mode 100644
index 0000000..c186cd4
--- /dev/null
+++ b/docs/topics/impala_limit.xml
@@ -0,0 +1,149 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="limit">
+
+ <title>LIMIT Clause</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Reports"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ The <codeph>LIMIT</codeph> clause in a <codeph>SELECT</codeph> query sets a maximum number of rows for the
+ result set. Pre-selecting the maximum size of the result set helps Impala to optimize memory usage while
+ processing a distributed query.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>LIMIT <varname>constant_integer_expression</varname></codeblock>
+
+ <p>
+ The argument to the <codeph>LIMIT</codeph> clause must evaluate to a constant value. It can be a numeric
+ literal, or another kind of numeric expression involving operators, casts, and function return values. You
+ cannot refer to a column or use a subquery.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ This clause is useful in contexts such as:
+ </p>
+
+ <ul>
+ <li>
+ To return exactly N items from a top-N query, such as the 10 highest-rated items in a shopping category or
+ the 50 hostnames that refer the most traffic to a web site.
+ </li>
+
+ <li>
+ To demonstrate some sample values from a table or a particular query. (To display some arbitrary items, use
+ a query with no <codeph>ORDER BY</codeph> clause. An <codeph>ORDER BY</codeph> clause causes additional
+ memory and/or disk usage during the query.)
+ </li>
+
+ <li>
+ To keep queries from returning huge result sets by accident if a table is larger than expected, or a
+ <codeph>WHERE</codeph> clause matches more rows than expected.
+ </li>
+ </ul>
+
+ <p rev="1.2.1">
+ Originally, the value for the <codeph>LIMIT</codeph> clause had to be a numeric literal. In Impala 1.2.1 and
+ higher, it can be a numeric expression.
+ </p>
+
+ <p rev="obwl" conref="../shared/impala_common.xml#common/order_by_limit"/>
+
+ <p>
+ See <xref href="impala_order_by.xml#order_by"/> for details.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/limit_and_offset"/>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/subquery_no_limit"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example shows how the <codeph>LIMIT</codeph> clause caps the size of the result set, with the
+ limit being applied after any other clauses such as <codeph>WHERE</codeph>.
+ </p>
+
+<codeblock>[localhost:21000] > create database limits;
+[localhost:21000] > use limits;
+[localhost:21000] > create table numbers (x int);
+[localhost:21000] > insert into numbers values (1), (3), (4), (5), (2);
+Inserted 5 rows in 1.34s
+[localhost:21000] > select x from numbers limit 100;
++---+
+| x |
++---+
+| 1 |
+| 3 |
+| 4 |
+| 5 |
+| 2 |
++---+
+Returned 5 row(s) in 0.26s
+[localhost:21000] > select x from numbers limit 3;
++---+
+| x |
++---+
+| 1 |
+| 3 |
+| 4 |
++---+
+Returned 3 row(s) in 0.27s
+[localhost:21000] > select x from numbers where x > 2 limit 2;
++---+
+| x |
++---+
+| 3 |
+| 4 |
++---+
+Returned 2 row(s) in 0.27s</codeblock>
+
+ <p>
+ For top-N and bottom-N queries, you use the <codeph>ORDER BY</codeph> and <codeph>LIMIT</codeph> clauses
+ together:
+ </p>
+
+<codeblock rev="obwl">[localhost:21000] > select x as "Top 3" from numbers order by x desc limit 3;
++-------+
+| top 3 |
++-------+
+| 5 |
+| 4 |
+| 3 |
++-------+
+[localhost:21000] > select x as "Bottom 3" from numbers order by x limit 3;
++----------+
+| bottom 3 |
++----------+
+| 1 |
+| 2 |
+| 3 |
++----------+
+</codeblock>
+
+ <p>
+ You can use constant values besides integer literals as the <codeph>LIMIT</codeph> argument:
+ </p>
+
+<codeblock>-- Other expressions that yield constant integer values work too.
+SELECT x FROM t1 LIMIT 1e6; -- Limit is one million.
+SELECT x FROM t1 LIMIT length('hello world'); -- Limit is 11.
+SELECT x FROM t1 LIMIT 2+2; -- Limit is 4.
+SELECT x FROM t1 LIMIT cast(truncate(9.9) AS INT); -- Limit is 9.
+</codeblock>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_literals.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_literals.xml b/docs/topics/impala_literals.xml
new file mode 100644
index 0000000..3c53796
--- /dev/null
+++ b/docs/topics/impala_literals.xml
@@ -0,0 +1,384 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="literals">
+
+ <title>Literals</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">literals</indexterm>
+ Each of the Impala data types has corresponding notation for literal values of that type. You specify literal
+ values in SQL statements, such as in the <codeph>SELECT</codeph> list or <codeph>WHERE</codeph> clause of a
+ query, or as an argument to a function call. See <xref href="impala_datatypes.xml#datatypes"/> for a complete
+ list of types, ranges, and conversion rules.
+ </p>
+
+ <p outputclass="toc inpage"/>
+ </conbody>
+
+ <concept id="numeric_literals">
+
+ <title>Numeric Literals</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">numeric literals</indexterm>
+ To write literals for the integer types (<codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>,
+ <codeph>INT</codeph>, and <codeph>BIGINT</codeph>), use a sequence of digits with optional leading zeros.
+ </p>
+
+ <p rev="1.4.0">
+ To write literals for the floating-point types (<codeph rev="1.4.0">DECIMAL</codeph>,
+ <codeph>FLOAT</codeph>, and <codeph>DOUBLE</codeph>), use a sequence of digits with an optional decimal
+ point (<codeph>.</codeph> character). To preserve accuracy during arithmetic expressions, Impala interprets
+ floating-point literals as the <codeph>DECIMAL</codeph> type with the smallest appropriate precision and
+ scale, until required by the context to convert the result to <codeph>FLOAT</codeph> or
+ <codeph>DOUBLE</codeph>.
+ </p>
+
+ <p>
+ Integer values are promoted to floating-point when necessary, based on the context.
+ </p>
+
+ <p>
+ You can also use exponential notation by including an <codeph>e</codeph> character. For example,
+ <codeph>1e6</codeph> is 1 times 10 to the power of 6 (1 million). A number in exponential notation is
+ always interpreted as floating-point.
+ </p>
+
+ <p rev="tk">
+ When Impala encounters a numeric literal, it considers the type to be the <q>smallest</q> that can
+ accurately represent the value. The type is promoted to larger or more accurate types if necessary, based
+ on subsequent parts of an expression.
+ </p>
+ <p>
+ For example, you can see by the types Impala defines for the following table columns
+ how it interprets the corresponding numeric literals:
+ </p>
+<codeblock>[localhost:21000] > create table ten as select 10 as x;
++-------------------+
+| summary |
++-------------------+
+| Inserted 1 row(s) |
++-------------------+
+[localhost:21000] > desc ten;
++------+---------+---------+
+| name | type | comment |
++------+---------+---------+
+| x | tinyint | |
++------+---------+---------+
+
+[localhost:21000] > create table four_k as select 4096 as x;
++-------------------+
+| summary |
++-------------------+
+| Inserted 1 row(s) |
++-------------------+
+[localhost:21000] > desc four_k;
++------+----------+---------+
+| name | type | comment |
++------+----------+---------+
+| x | smallint | |
++------+----------+---------+
+
+[localhost:21000] > create table one_point_five as select 1.5 as x;
++-------------------+
+| summary |
++-------------------+
+| Inserted 1 row(s) |
++-------------------+
+[localhost:21000] > desc one_point_five;
++------+--------------+---------+
+| name | type | comment |
++------+--------------+---------+
+| x | decimal(2,1) | |
++------+--------------+---------+
+
+[localhost:21000] > create table one_point_three_three_three as select 1.333 as x;
++-------------------+
+| summary |
++-------------------+
+| Inserted 1 row(s) |
++-------------------+
+[localhost:21000] > desc one_point_three_three_three;
++------+--------------+---------+
+| name | type | comment |
++------+--------------+---------+
+| x | decimal(4,3) | |
++------+--------------+---------+
+</codeblock>
+ </conbody>
+ </concept>
+
+ <concept id="string_literals">
+
+ <title>String Literals</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">string literals</indexterm>
+ String literals are quoted using either single or double quotation marks. You can use either kind of quotes
+ for string literals, even both kinds for different literals within the same statement.
+ </p>
+
+ <p rev="2.0.0">
+ Quoted literals are considered to be of type <codeph>STRING</codeph>. To use quoted literals in contexts
+ requiring a <codeph>CHAR</codeph> or <codeph>VARCHAR</codeph> value, <codeph>CAST()</codeph> the literal to
+ a <codeph>CHAR</codeph> or <codeph>VARCHAR</codeph> of the appropriate length.
+ </p>
+
+ <p>
+ <b>Escaping special characters:</b>
+ </p>
+
+ <p>
+ To encode special characters within a string literal, precede them with the backslash (<codeph>\</codeph>)
+ escape character:
+ </p>
+
+ <ul>
+ <li>
+ <codeph>\t</codeph> represents a tab.
+ </li>
+
+ <li>
+ <codeph>\n</codeph> represents a newline or linefeed. This might cause extra line breaks in
+ <cmdname>impala-shell</cmdname> output.
+ </li>
+
+ <li>
+ <codeph>\r</codeph> represents a carriage return. This might cause unusual formatting (making it appear
+ that some content is overwritten) in <cmdname>impala-shell</cmdname> output.
+ </li>
+
+ <li>
+ <codeph>\b</codeph> represents a backspace. This might cause unusual formatting (making it appear that
+ some content is overwritten) in <cmdname>impala-shell</cmdname> output.
+ </li>
+
+ <li>
+ <codeph>\0</codeph> represents an ASCII <codeph>nul</codeph> character (not the same as a SQL
+ <codeph>NULL</codeph>). This might not be visible in <cmdname>impala-shell</cmdname> output.
+ </li>
+
+ <li>
+ <codeph>\Z</codeph> represents a DOS end-of-file character. This might not be visible in
+ <cmdname>impala-shell</cmdname> output.
+ </li>
+
+ <li>
+ <codeph>\%</codeph> and <codeph>\_</codeph> can be used to escape wildcard characters within the string
+ passed to the <codeph>LIKE</codeph> operator.
+ </li>
+
+ <li>
+ <codeph>\</codeph> followed by 3 octal digits represents the ASCII code of a single character; for
+ example, <codeph>\101</codeph> is ASCII 65, the character <codeph>A</codeph>.
+ </li>
+
+ <li>
+ Use two consecutive backslashes (<codeph>\\</codeph>) to prevent the backslash from being interpreted as
+ an escape character.
+ </li>
+
+ <li>
+ Use the backslash to escape single or double quotation mark characters within a string literal, if the
+ literal is enclosed by the same type of quotation mark.
+ </li>
+
+ <li>
+ If the character following the <codeph>\</codeph> does not represent the start of a recognized escape
+ sequence, the character is passed through unchanged.
+ </li>
+ </ul>
+
+ <p>
+ <b>Quotes within quotes:</b>
+ </p>
+
+ <p>
+ To include a single quotation character within a string value, enclose the literal with either single or
+ double quotation marks, and optionally escape the single quote as a <codeph>\'</codeph> sequence. Earlier
+ releases required escaping a single quote inside double quotes. Continue using escape sequences in this
+ case if you also need to run your SQL code on older versions of Impala.
+ </p>
+
+ <p>
+ To include a double quotation character within a string value, enclose the literal with single quotation
+ marks; no escaping is necessary in this case. Or, enclose the literal with double quotation marks and
+ escape the double quote as a <codeph>\"</codeph> sequence.
+ </p>
+
+<codeblock>[localhost:21000] > select "What\'s happening?" as single_within_double,
+ > 'I\'m not sure.' as single_within_single,
+ > "Homer wrote \"The Iliad\"." as double_within_double,
+ > 'Homer also wrote "The Odyssey".' as double_within_single;
++----------------------+----------------------+--------------------------+---------------------------------+
+| single_within_double | single_within_single | double_within_double | double_within_single |
++----------------------+----------------------+--------------------------+---------------------------------+
+| What's happening? | I'm not sure. | Homer wrote "The Iliad". | Homer also wrote "The Odyssey". |
++----------------------+----------------------+--------------------------+---------------------------------+
+</codeblock>
+
+ <p>
+ <b>Field terminator character in CREATE TABLE:</b>
+ </p>
+
+ <note conref="../shared/impala_common.xml#common/thorn"/>
+
+ <p>
+ <b>impala-shell considerations:</b>
+ </p>
+
+ <p>
+ When dealing with output that includes non-ASCII or non-printable characters such as linefeeds and
+ backspaces, use the <cmdname>impala-shell</cmdname> options to save to a file, turn off pretty printing, or
+ both rather than relying on how the output appears visually. See
+ <xref href="impala_shell_options.xml#shell_options"/> for a list of <cmdname>impala-shell</cmdname>
+ options.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="boolean_literals">
+
+ <title>Boolean Literals</title>
+
+ <conbody>
+
+ <p>
+ For <codeph>BOOLEAN</codeph> values, the literals are <codeph>TRUE</codeph> and <codeph>FALSE</codeph>,
+ with no quotation marks and case-insensitive.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>select true;
+select * from t1 where assertion = false;
+select case bool_col when true then 'yes' when false then 'no' else 'null' end from t1;</codeblock>
+ </conbody>
+ </concept>
+
+ <concept id="timestamp_literals">
+
+ <title>Timestamp Literals</title>
+
+ <conbody>
+
+ <p conref="../shared/impala_common.xml#common/timestamp_conversions"/>
+
+ <p>
+ You can also use <codeph>INTERVAL</codeph> expressions to add or subtract from timestamp literal values,
+ such as <codeph>'1966-07-30' + INTERVAL 5 YEARS + INTERVAL 3 DAYS</codeph>. See
+ <xref href="impala_timestamp.xml#timestamp"/> for details.
+ </p>
+
+ <p>
+ Depending on your data pipeline, you might receive date and time data as text, in notation that does not
+ exactly match the format for Impala <codeph>TIMESTAMP</codeph> literals.
+ See <xref href="impala_datetime_functions.xml#datetime_functions"/> for functions that can convert
+ between a variety of string literals (including different field order, separators, and timezone notation)
+ and equivalent <codeph>TIMESTAMP</codeph> or numeric values.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="null">
+
+ <title>NULL</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">NULL</indexterm>
+ The notion of <codeph>NULL</codeph> values is familiar from all kinds of database systems, but each SQL
+ dialect can have its own behavior and restrictions on <codeph>NULL</codeph> values. For Big Data
+ processing, the precise semantics of <codeph>NULL</codeph> values are significant: any misunderstanding
+ could lead to inaccurate results or misformatted data that could be time-consuming to correct for large
+ data sets.
+ </p>
+
+ <ul>
+ <li>
+ <codeph>NULL</codeph> is a different value than an empty string. The empty string is represented by a
+ string literal with nothing inside, <codeph>""</codeph> or <codeph>''</codeph>.
+ </li>
+
+ <li>
+ In a delimited text file, the <codeph>NULL</codeph> value is represented by the special token
+ <codeph>\N</codeph>.
+ </li>
+
+ <li>
+ When Impala inserts data into a partitioned table, and the value of one of the partitioning columns is
+ <codeph>NULL</codeph> or the empty string, the data is placed in a special partition that holds only
+ these two kinds of values. When these values are returned in a query, the result is <codeph>NULL</codeph>
+ whether the value was originally <codeph>NULL</codeph> or an empty string. This behavior is compatible
+ with the way Hive treats <codeph>NULL</codeph> values in partitioned tables. Hive does not allow empty
+ strings as partition keys, and it returns a string value such as
+ <codeph>__HIVE_DEFAULT_PARTITION__</codeph> instead of <codeph>NULL</codeph> when such values are
+ returned from a query. For example:
+<codeblock>create table t1 (i int) partitioned by (x int, y string);
+-- Select an INT column from another table, with all rows going into a special HDFS subdirectory
+-- named __HIVE_DEFAULT_PARTITION__. Depending on whether one or both of the partitioning keys
+-- are null, this special directory name occurs at different levels of the physical data directory
+-- for the table.
+insert into t1 partition(x=NULL, y=NULL) select c1 from some_other_table;
+insert into t1 partition(x, y=NULL) select c1, c2 from some_other_table;
+insert into t1 partition(x=NULL, y) select c1, c3 from some_other_table;</codeblock>
+ </li>
+
+ <li>
+ There is no <codeph>NOT NULL</codeph> clause when defining a column to prevent <codeph>NULL</codeph>
+ values in that column.
+ </li>
+
+ <li>
+ There is no <codeph>DEFAULT</codeph> clause to specify a non-<codeph>NULL</codeph> default value.
+ </li>
+
+ <li>
+ If an <codeph>INSERT</codeph> operation mentions some columns but not others, the unmentioned columns
+ contain <codeph>NULL</codeph> for all inserted rows.
+ </li>
+
+ <li rev="1.2.1">
+ <p conref="../shared/impala_common.xml#common/null_sorting_change"/>
+ <note>
+ <draft-comment translate="no"> Probably a bunch of similar view-related restrictions like this that should be collected, reused, or cross-referenced under the Views topic. </draft-comment>
+ Because the <codeph>NULLS FIRST</codeph> and <codeph>NULLS LAST</codeph> keywords are not currently
+ available in Hive queries, any views you create using those keywords will not be available through
+ Hive.
+ </note>
+ </li>
+
+ <li>
+ In all other contexts besides sorting with <codeph>ORDER BY</codeph>, comparing a <codeph>NULL</codeph>
+ to anything else returns <codeph>NULL</codeph>, making the comparison meaningless. For example,
+ <codeph>10 &gt; NULL</codeph> produces <codeph>NULL</codeph>, <codeph>10 &lt; NULL</codeph> also produces
+ <codeph>NULL</codeph>, <codeph>5 BETWEEN 1 AND NULL</codeph> produces <codeph>NULL</codeph>, and so on.
+ </li>
+ </ul>
+
+ <p>
+ Several built-in functions serve as shorthand for evaluating expressions and returning
+ <codeph>NULL</codeph>, 0, or some other substitution value depending on the expression result:
+ <codeph>ifnull()</codeph>, <codeph>isnull()</codeph>, <codeph>nvl()</codeph>, <codeph>nullif()</codeph>,
+ <codeph>nullifzero()</codeph>, and <codeph>zeroifnull()</codeph>. See
+ <xref href="impala_conditional_functions.xml#conditional_functions"/> for details.
+ </p>
+ </conbody>
+ </concept>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_live_progress.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_live_progress.xml b/docs/topics/impala_live_progress.xml
new file mode 100644
index 0000000..f58cdcb
--- /dev/null
+++ b/docs/topics/impala_live_progress.xml
@@ -0,0 +1,81 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.3.0" id="live_progress">
+
+ <title>LIVE_PROGRESS Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Performance"/>
+ <data name="Category" value="Reports"/>
+ <data name="Category" value="impala-shell"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">LIVE_PROGRESS query option</indexterm>
+ For queries submitted through the <cmdname>impala-shell</cmdname> command,
+ displays an interactive progress bar showing roughly what percentage of
+ processing has been completed. When the query finishes, the progress bar is erased
+ from the <cmdname>impala-shell</cmdname> console output.
+ </p>
+
+ <p>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/default_false_0"/>
+
+ <p conref="../shared/impala_common.xml#common/command_line_blurb"/>
+ <p>
+ You can enable this query option within <cmdname>impala-shell</cmdname>
+ by starting the shell with the <codeph>--live_progress</codeph>
+ command-line option.
+ You can still turn this setting off and on again within the shell through the
+ <codeph>SET</codeph> command.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p conref="../shared/impala_common.xml#common/live_reporting_details"/>
+ <p>
+ For a more detailed way of tracking the progress of an interactive query through
+ all phases of processing, see <xref href="impala_live_summary.xml#live_summary"/>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+ <p>
+ Because the percentage complete figure is calculated using the number of
+ issued and completed <q>scan ranges</q>, which occur while reading the table
+ data, the progress bar might reach 100% before the query is entirely finished.
+ For example, the query might do work to perform aggregations after all the
+ table data has been read. If many of your queries fall into this category,
+ consider using the <codeph>LIVE_SUMMARY</codeph> option instead for
+ more granular progress reporting.
+ </p>
+ <p conref="../shared/impala_common.xml#common/impala_shell_progress_reports_compute_stats_caveat"/>
+ <p conref="../shared/impala_common.xml#common/impala_shell_progress_reports_shell_only_caveat"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock><![CDATA[[localhost:21000] > set live_progress=true;
+LIVE_PROGRESS set to true
+[localhost:21000] > select count(*) from customer;
++----------+
+| count(*) |
++----------+
+| 150000 |
++----------+
+[localhost:21000] > select count(*) from customer t1 cross join customer t2;
+[################################################## ] 50%
+[####################################################################################################] 100%
+
+]]>
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/live_progress_live_summary_asciinema"/>
+
+ </conbody>
+</concept>
[21/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/shared/impala_common.xml
----------------------------------------------------------------------
diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
new file mode 100644
index 0000000..37ebc34
--- /dev/null
+++ b/docs/shared/impala_common.xml
@@ -0,0 +1,2477 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept xmlns:ditaarch="http://dita.oasis-open.org/architecture/2005/" id="common" ditaarch:DITAArchVersion="1.2" domains="(topic concept) (topic hi-d) (topic ut-d) (topic indexing-d) (topic hazard-d) (topic abbrev-d) (topic pr-d) (topic sw-d) (topic ui-d) " xml:lang="en-US">
+
+ <title>Reusable Text, Paragraphs, List Items, and Other Elements for Impala</title>
+
+ <conbody>
+
+ <p>
+ All the elements in this file with IDs are intended to be conref'ed elsewhere. Practically all of the
+ conref'ed elements for the Impala docs are in this file, to avoid questions of when it's safe to remove or
+ move something in any of the 'main' files, and avoid having to change and conref references as a result.
+ </p>
+
+ <p>
+ This file defines some dummy subheadings as section elements, just for self-documentation. Using sections
+ instead of nested concepts lets all the conref links point to a very simple name pattern,
+ '#common/id_within_the_file', rather than a 3-part reference with an intervening, variable concept ID.
+ </p>
+
+ <section id="sentry">
+
+ <title>Sentry-Related Content</title>
+
+ <p>
+ Material related to Sentry security, intended to be reused between Hive and Impala. Complicated by the fact
+ that most of it will probably be multi-paragraph or involve subheads, might need to be represented as
+ nested topics at the end of this file.
+ </p>
+
+ <note id="authentication_vs_authorization">
+ Regardless of the authentication mechanism used, Impala always creates HDFS directories and data files
+ owned by the same user (typically <codeph>impala</codeph>). To implement user-level access to different
+ databases, tables, columns, partitions, and so on, use the Sentry authorization feature, as explained in
+ <xref href="../topics/impala_authorization.xml#authorization"/>.
+ </note>
+
+<!-- Contrived nesting needed to allow <ph> with ID to be reused inside the <title> of a conref. -->
+
+ <p>
+ <b><ph id="title_sentry_debug">Debugging Failed Sentry Authorization Requests</ph></b>
+ </p>
+
+ <p id="sentry_debug">
+ Sentry logs all facts that lead up to authorization decisions at the debug level. If you do not understand
+ why Sentry is denying access, the best way to debug is to temporarily turn on debug logging:
+ <ul>
+ <li>
+ In Cloudera Manager, add <codeph>log4j.logger.org.apache.sentry=DEBUG</codeph> to the logging settings
+ for your service through the corresponding <uicontrol>Logging Safety Valve</uicontrol> field for the
+ Impala, Hive Server 2, or Solr Server services.
+ </li>
+
+ <li>
+ On systems not managed by Cloudera Manager, add <codeph>log4j.logger.org.apache.sentry=DEBUG</codeph>
+ to the <filepath>log4j.properties</filepath> file on each host in the cluster, in the appropriate
+ configuration directory for each service.
+ </li>
+ </ul>
+ Specifically, look for exceptions and messages such as:
+<codeblock xml:space="preserve">FilePermission server..., RequestPermission server...., result [true|false]</codeblock>
+ which indicate each evaluation Sentry makes. The <codeph>FilePermission</codeph> is from the policy file,
+ while <codeph>RequestPermission</codeph> is the privilege required for the query. A
+ <codeph>RequestPermission</codeph> will iterate over all appropriate <codeph>FilePermission</codeph>
+ settings until a match is found. If no matching privilege is found, Sentry returns <codeph>false</codeph>
+ indicating <q>Access Denied</q>.
+<!--
+[1]
+Impala: Impala Daemon -> Advanced -> Impala Daemon Logging Safety Valve
+Hive: Hive Server 2 -> Advanced -> HiveServer2 Logging Safety Valve
+Search: Solr Server -> Advanced -> Solr Server Logging Safety Valve
+-->
+ </p>
+
+ </section>
+
+ <section id="cm">
+
+ <title>Cloudera Manager Terminology</title>
+
+ <p>
+ Especially during the transition from CM 4 to CM 5, we'll use some stock phraseology to talk about fields
+ and such.
+ </p>
+
+ <p>
+ <ph id="safety_valve"> In Cloudera Manager 4, these fields are labelled <uicontrol>Safety
+ Valve</uicontrol>; in Cloudera Manager 5, they are called <uicontrol>Advanced Configuration
+ Snippet</uicontrol>. </ph>
+ </p>
+
+ </section>
+
+ <section id="citi">
+
+ <title>Items from the Citibank Escalation Spreadsheet</title>
+
+ <p>
+ Paragraphs with IDs are intended to be reused both in the FAQ and the User's Guide. They refer to feature
+ requests or misunderstandings encountered by Citibank, captured in the escalation spreadsheet here:
+ <xref href="https://docs.google.com/a/cloudera.com/spreadsheet/ccc?key=0AplfwQJKyyTWdFdhY0E5WHVwNXZSTG9sMEZwQy1QZ1E&amp;usp=drive_web#gid=0" scope="external" format="html"/>.
+ </p>
+
+ <p id="string_concatenation">
+ With Impala, you use the built-in <codeph>CONCAT()</codeph> function to concatenate two, three, or more
+ strings:
+<codeblock xml:space="preserve">select concat('some prefix: ', col1) from t1;
+select concat('abc','mno','xyz');</codeblock>
+ Impala does not currently support operators for string concatenation, such as <codeph>||</codeph> as seen
+ in some other database systems.
+ </p>
+
+ <p id="column_aliases">
+ You can specify column aliases with or without the <codeph>AS</codeph> keyword, and with no quotation
+ marks, single quotation marks, or double quotation marks. Some kind of quotation marks are required if the
+ column alias contains any spaces or other problematic characters. The alias text is displayed in the
+ <cmdname>impala-shell</cmdname> output as all-lowercase. For example:
+<codeblock xml:space="preserve">[localhost:21000] > select c1 First_Column from t;
+[localhost:21000] > select c1 as First_Column from t;
++--------------+
+| first_column |
++--------------+
+...
+
+[localhost:21000] > select c1 'First Column' from t;
+[localhost:21000] > select c1 as 'First Column' from t;
++--------------+
+| first column |
++--------------+
+...
+
+[localhost:21000] > select c1 "First Column" from t;
+[localhost:21000] > select c1 as "First Column" from t;
++--------------+
+| first column |
++--------------+
+...</codeblock>
+ </p>
+
+ <p id="temp_tables">
+ Currently, Impala does not support temporary tables. Some other database systems have a class of
+ <q>lightweight</q> tables that are held only in memory and/or that are only accessible by one connection
+ and disappear when the session ends. In Impala, creating new databases is a relatively lightweight
+ operation, so as an alternative, you could create a database with a unique name and use <codeph>CREATE
+ TABLE LIKE</codeph>, <codeph>CREATE TABLE AS SELECT</codeph>, and <codeph>INSERT</codeph> statements to
+ create a table in that database to hold the result set of a query, to use in subsequent queries. When
+ finished, issue a <codeph>DROP TABLE</codeph> statement followed by <codeph>DROP DATABASE</codeph>.
+ </p>
+
+ </section>
+
+ <section id="standards">
+
+ <title>Blurbs About Standards Compliance</title>
+
+ <p>
+ The following blurbs simplify the process of flagging which SQL standard various features were first
+ introduced in. The wording and the tagging can be modified by editing one central instance of each blurb.
+ Not extensively used yet, just here and there in the SQL Language Reference section.
+ </p>
+
+ <p id="sql1986">
+<!-- No Wikipedia page for SQL-1986, so no link. -->
+ <b>Standards compliance:</b> Introduced in SQL-1986.
+ </p>
+
+ <p id="sql1989">
+<!-- No Wikipedia page for SQL-1989, so no link. -->
+ <b>Standards compliance:</b> Introduced in SQL-1989.
+ </p>
+
+ <p id="sql1992">
+ <b>Standards compliance:</b> Introduced in
+ <xref href="http://en.wikipedia.org/wiki/SQL-92" scope="external" format="html">SQL-1992</xref>.
+ </p>
+
+ <p id="sql1999">
+ <b>Standards compliance:</b> Introduced in
+ <xref href="http://en.wikipedia.org/wiki/SQL:1999" scope="external" format="html">SQL:1999</xref>.
+ </p>
+
+ <p id="sql2003">
+ <b>Standards compliance:</b> Introduced in
+ <xref href="http://en.wikipedia.org/wiki/SQL:2003" scope="external" format="html">SQL:2003</xref>.
+ </p>
+
+ <p id="sql2008">
+ <b>Standards compliance:</b> Introduced in
+ <xref href="http://en.wikipedia.org/wiki/SQL:2008" scope="external" format="html">SQL:2008</xref>.
+ </p>
+
+ <p id="sql2011">
+ <b>Standards compliance:</b> Introduced in
+ <xref href="http://en.wikipedia.org/wiki/SQL:2011" scope="external" format="html">SQL:2011</xref>.
+ </p>
+
+ <p id="hiveql">
+ <b>Standards compliance:</b> Extension first introduced in HiveQL.
+ </p>
+
+ <p id="impalaql">
+ <b>Standards compliance:</b> Extension first introduced in Impala.
+ </p>
+
+ </section>
+
+ <section id="refresh_invalidate">
+
+ <title>Background Info for REFRESH, INVALIDATE METADATA, and General Metadata Discussion</title>
+
+ <p id="refresh_vs_invalidate">
+ <codeph>INVALIDATE METADATA</codeph> and <codeph>REFRESH</codeph> are counterparts: <codeph>INVALIDATE
+ METADATA</codeph> waits to reload the metadata when needed for a subsequent query, but reloads all the
+ metadata for the table, which can be an expensive operation, especially for large tables with many
+ partitions. <codeph>REFRESH</codeph> reloads the metadata immediately, but only loads the block location
+ data for newly added data files, making it a less expensive operation overall. If data was altered in some
+ more extensive way, such as being reorganized by the HDFS balancer, use <codeph>INVALIDATE
+ METADATA</codeph> to avoid a performance penalty from reduced local reads. If you used Impala version 1.0,
+ the <codeph>INVALIDATE METADATA</codeph> statement works just like the Impala 1.0 <codeph>REFRESH</codeph>
+ statement did, while the Impala 1.1 <codeph>REFRESH</codeph> is optimized for the common use case of adding
+ new data files to an existing table, thus the table name argument is now required.
+ </p>
+
+ </section>
+
+ <section id="kudu">
+
+ <title>Kudu Snippets</title>
+
+ <p>
+ If any advice, background info, or warnings are needed in multiple
+ places for interaction of Impala with Kudu, put them under here.
+ </p>
+
+ </section>
+
+ <section id="sql_ref">
+
+ <title>SQL Language Reference Snippets</title>
+
+ <p>
+ These reusable chunks were taken from conrefs originally in <filepath>ciiu_langref_sql.xml</filepath>. Or
+ they are primarily used in new SQL syntax topics underneath that parent topic.
+ </p>
+
+ <p id="live_reporting_details">
+ The output from this query option is printed to standard error. The output is only displayed in interactive mode,
+ that is, not when the <codeph>-q</codeph> or <codeph>-f</codeph> options are used.
+ </p>
+
+ <p id="live_progress_live_summary_asciinema">
+ To see how the <codeph>LIVE_PROGRESS</codeph> and <codeph>LIVE_SUMMARY</codeph> query options
+ work in real time, see <xref href="https://asciinema.org/a/1rv7qippo0fe7h5k1b6k4nexk" scope="external" format="html">this animated demo</xref>.
+ </p>
+
+ <p rev="2.3.0" id="impala_shell_progress_reports_compute_stats_caveat">
+ The <codeph>LIVE_PROGRESS</codeph> and <codeph>LIVE_SUMMARY</codeph> query options
+ currently do not produce any output during <codeph>COMPUTE STATS</codeph> operations.
+ </p>
+
+<!-- This is a shorter version of the similar 'caveat' text. This shorter one can be reused more easily in various places. -->
+ <p rev="2.3.0" id="impala_shell_progress_reports_shell_only_blurb">
+ The <codeph>LIVE_PROGRESS</codeph> and <codeph>LIVE_SUMMARY</codeph> query options only apply
+ inside the <cmdname>impala-shell</cmdname> interpreter. You cannot use them with the
+ <codeph>SET</codeph> statement from a JDBC or ODBC application.
+ </p>
+
+ <p id="impala_shell_progress_reports_shell_only_caveat">
+ Because the <codeph>LIVE_PROGRESS</codeph> and <codeph>LIVE_SUMMARY</codeph> query options
+ are available only within the <cmdname>impala-shell</cmdname> interpreter:
+ <ul>
+ <li>
+ <p>
+ You cannot change these query options through the SQL <codeph>SET</codeph>
+ statement using the JDBC or ODBC interfaces. The <codeph>SET</codeph>
+ command in <cmdname>impala-shell</cmdname> recognizes these names as
+ shell-only options.
+ </p>
+ </li>
+ <li>
+ <p>
+ Be careful when using <cmdname>impala-shell</cmdname> on a pre-CDH 5.5
+ system to connect to Impala running on a CDH 5.5 or higher system.
+ The older <cmdname>impala-shell</cmdname> does not recognize these
+ query option names. Upgrade <cmdname>impala-shell</cmdname> on the
+ systems where you intend to use these query options.
+ </p>
+ </li>
+ <li>
+ <p>
+ Likewise, the <cmdname>impala-shell</cmdname> command relies on
+ some information only available in Impala 2.3 / CDH 5.5 and higher
+ to prepare live progress reports and query summaries. The
+ <codeph>LIVE_PROGRESS</codeph> and <codeph>LIVE_SUMMARY</codeph>
+ query options have no effect when <cmdname>impala-shell</cmdname> connects
+ to a cluster running an older version of Impala.
+ </p>
+ </li>
+ </ul>
+ </p>
+
+<!-- Same example used in both CREATE DATABASE and DROP DATABASE. -->
+<codeblock id="create_drop_db_example">create database first_db;
+use first_db;
+create table t1 (x int);
+
+create database second_db;
+use second_db;
+-- Each database has its own namespace for tables.
+-- You can reuse the same table names in each database.
+create table t1 (s string);
+
+create database temp;
+
+-- You can either USE a database after creating it,
+-- or qualify all references to the table name with the name of the database.
+-- Here, tables T2 and T3 are both created in the TEMP database.
+
+create table temp.t2 (x int, y int);
+use temp;
+create table t3 (s string);
+
+-- You cannot drop a database while it is selected by the USE statement.
+drop database temp;
+<i>ERROR: AnalysisException: Cannot drop current default database: temp</i>
+
+-- The always-available database 'default' is a convenient one to USE
+-- before dropping a database you created.
+use default;
+
+-- Before dropping a database, first drop all the tables inside it,
+<ph rev="2.3.0">-- or in CDH 5.5 and higher use the CASCADE clause.</ph>
+drop database temp;
+ERROR: ImpalaRuntimeException: Error making 'dropDatabase' RPC to Hive Metastore:
+CAUSED BY: InvalidOperationException: Database temp is not empty
+show tables in temp;
++------+
+| name |
++------+
+| t3 |
++------+
+
+<ph rev="2.3.0">-- CDH 5.5 and higher:</ph>
+<ph rev="2.3.0">drop database temp cascade;</ph>
+
+-- CDH 5.4 and lower:
+drop table temp.t3;
+drop database temp;
+</codeblock>
+
+ <p id="cast_convenience_fn_example">
+ This example shows how to use the <codeph>castto*()</codeph> functions as an equivalent
+ to <codeph>CAST(<varname>value</varname> AS <varname>type</varname>)</codeph> expressions.
+ </p>
+
+ <p id="cast_convenience_fn_usage"><b>Usage notes:</b>
+ A convenience function to skip the SQL <codeph>CAST(<varname>value</varname> AS <varname>type</varname>)</codeph> syntax,
+ for example when programmatically generating SQL statements where a regular function call might be easier to construct.
+ </p>
+
+ <p rev="2.2.0" id="timezone_conversion_caveat">
+ The way this function deals with time zones when converting to or from <codeph>TIMESTAMP</codeph>
+ values is affected by the <codeph>-use_local_tz_for_unix_timestamp_conversions</codeph> startup flag for the
+ <cmdname>impalad</cmdname> daemon. See <xref href="../topics/impala_timestamp.xml#timestamp"/> for details about
+ how Impala handles time zone considerations for the <codeph>TIMESTAMP</codeph> data type.
+ </p>
+
+ <note rev="2.2.0" id="s3_caveat" type="important">
+ <p>
+ Impala query support for Amazon S3 is included in CDH 5.4.0, but is not currently supported or recommended for production use.
+ If you're interested in this feature, try it out in a test environment until we address the issues and limitations needed for production-readiness.
+ </p>
+ </note>
+
+ <p rev="2.2.0" id="s3_dml">
+ Currently, Impala cannot insert or load data into a table or partition that resides in the Amazon
+ Simple Storage Service (S3).
+ Bring data into S3 using the normal S3 transfer mechanisms, then use Impala to query the S3 data.
+ See <xref href="../topics/impala_s3.xml#s3"/> for details about using Impala with S3.
+ </p>
+
+ <p rev="2.2.0" id="s3_metadata">
+ The <codeph>REFRESH</codeph> and <codeph>INVALIDATE METADATA</codeph> statements also cache metadata
+ for tables where the data resides in the Amazon Simple Storage Service (S3).
+ In particular, issue a <codeph>REFRESH</codeph> for a table after adding or removing files
+ in the associated S3 data directory.
+ See <xref href="../topics/impala_s3.xml#s3"/> for details about working with S3 tables.
+ </p>
+
+ <p id="y2k38" rev="2.2.0">
+ In Impala 2.2.0 and higher, built-in functions that accept or return integers representing <codeph>TIMESTAMP</codeph> values
+ use the <codeph>BIGINT</codeph> type for parameters and return values, rather than <codeph>INT</codeph>.
+ This change lets the date and time functions avoid an overflow error that would otherwise occur
+ on January 19th, 2038 (known as the
+ <xref href="http://en.wikipedia.org/wiki/Year_2038_problem" scope="external" format="html"><q>Year 2038 problem</q> or <q>Y2K38 problem</q></xref>).
+ This change affects the <codeph>from_unixtime()</codeph> and <codeph>unix_timestamp()</codeph> functions.
+ You might need to change application code that interacts with these functions, change the types of
+ columns that store the return values, or add <codeph>CAST()</codeph> calls to SQL statements that
+ call these functions.
+ </p>
+
+ <p id="timestamp_conversions">
+ Impala automatically converts <codeph>STRING</codeph> literals of the correct format into
+ <codeph>TIMESTAMP</codeph> values. Timestamp values are accepted in the format
+ <codeph>"yyyy-MM-dd HH:mm:ss.SSSSSS"</codeph>, and can consist of just the date, or just the time, with or
+ without the fractional second portion. For example, you can specify <codeph>TIMESTAMP</codeph> values such as
+ <codeph>'1966-07-30'</codeph>, <codeph>'08:30:00'</codeph>, or <codeph>'1985-09-25 17:45:30.005'</codeph>.
+ <ph conref="../shared/impala_common.xml#common/cast_int_to_timestamp"/>
+ </p>
+
+
+ <p>
+ <ph id="cast_int_to_timestamp">Casting an integer or floating-point value <codeph>N</codeph> to
+ <codeph>TIMESTAMP</codeph> produces a value that is <codeph>N</codeph> seconds past the start of the epoch
+ date (January 1, 1970). By default, the result value represents a date and time in the UTC time zone.
+ If the setting <codeph>-use_local_tz_for_unix_timestamp_conversions=true</codeph> is in effect,
+ the resulting <codeph>TIMESTAMP</codeph> represents a date and time in the local time zone.</ph>
+ </p>
+
+ <p id="redaction_yes" rev="2.2.0">
+ If these statements in your environment contain sensitive literal values such as credit card numbers or tax
+ identifiers, Impala can redact this sensitive information when displaying the statements in log files and
+ other administrative contexts. See
+ <xref audience="integrated" href="../topics/sg_redaction.xml#log_redact"/><xref audience="standalone" href="http://www.cloudera.com/content/cloudera/en/documentation/core/latest/topics/sg_redaction.html" scope="external" format="html"/>
+ for details.
+ </p>
+
+ <p id="incremental_partition_spec">
+ The <codeph>PARTITION</codeph> clause is only allowed in combination with the <codeph>INCREMENTAL</codeph>
+ clause. It is optional for <codeph>COMPUTE INCREMENTAL STATS</codeph>, and required for <codeph>DROP
+ INCREMENTAL STATS</codeph>. Whenever you specify partitions through the <codeph>PARTITION
+ (<varname>partition_spec</varname>)</codeph> clause in a <codeph>COMPUTE INCREMENTAL STATS</codeph> or
+ <codeph>DROP INCREMENTAL STATS</codeph> statement, you must include all the partitioning columns in the
+ specification, and specify constant values for all the partition key columns.
+ </p>
+
+ <p id="udf_persistence_restriction">
+ Currently, Impala UDFs and UDAs are not persisted in the metastore database. Information
+ about these functions is held in the memory of the <cmdname>catalogd</cmdname> daemon. You must reload them
+ by running the <codeph>CREATE FUNCTION</codeph> statements again each time you restart the
+ <cmdname>catalogd</cmdname> daemon.
+ </p>
+
+ <note id="add_partition_set_location">
+ If you are creating a partition for the first time and specifying its location, for maximum efficiency, use
+ a single <codeph>ALTER TABLE</codeph> statement including both the <codeph>ADD PARTITION</codeph> and
+ <codeph>LOCATION</codeph> clauses, rather than separate statements with <codeph>ADD PARTITION</codeph> and
+ <codeph>SET LOCATION</codeph> clauses.
+ </note>
+
+ <p id="insert_hidden_work_directory">
+ The <codeph>INSERT</codeph> statement has always left behind a hidden work directory inside the data
+ directory of the table. Formerly, this hidden work directory was named
+ <filepath>.impala_insert_staging</filepath> . In Impala 2.0.1 and later, this directory name is changed to
+ <filepath>_impala_insert_staging</filepath> . (While HDFS tools are expected to treat names beginning
+ either with underscore and dot as hidden, in practice names beginning with an underscore are more widely
+ supported.) If you have any scripts, cleanup jobs, and so on that rely on the name of this work directory,
+ adjust them to use the new name.
+ </p>
+
+ <p id="check_internal_external_table">
+ To see whether a table is internal or external, and its associated HDFS location, issue the statement
+ <codeph>DESCRIBE FORMATTED <varname>table_name</varname></codeph>. The <codeph>Table Type</codeph> field
+ displays <codeph>MANAGED_TABLE</codeph> for internal tables and <codeph>EXTERNAL_TABLE</codeph> for
+ external tables. The <codeph>Location</codeph> field displays the path of the table directory as an HDFS
+ URI.
+ </p>
+
+ <p id="switch_internal_external_table">
+ You can switch a table from internal to external, or from external to internal, by using the <codeph>ALTER
+ TABLE</codeph> statement:
+<codeblock xml:space="preserve">
+-- Switch a table from internal to external.
+ALTER TABLE <varname>table_name</varname> SET TBLPROPERTIES('EXTERNAL'='TRUE');
+
+-- Switch a table from external to internal.
+ALTER TABLE <varname>table_name</varname> SET TBLPROPERTIES('EXTERNAL'='FALSE');
+</codeblock>
+ </p>
+
+<!-- The data to show sensible output from these queries is in the TPC-DS schema 'CUSTOMER' table.
+ If you want to show real output, add a LIMIT 5 or similar clause to each query to avoid
+ too-long output. -->
+
+<codeblock id="regexp_rlike_examples" xml:space="preserve">-- Find all customers whose first name starts with 'J', followed by 0 or more of any character.
+select c_first_name, c_last_name from customer where c_first_name regexp '^J.*';
+select c_first_name, c_last_name from customer where c_first_name rlike '^J.*';
+
+-- Find 'Macdonald', where the first 'a' is optional and the 'D' can be upper- or lowercase.
+-- The ^...$ are required, to match the start and end of the value.
+select c_first_name, c_last_name from customer where c_last_name regexp '^Ma?c[Dd]onald$';
+select c_first_name, c_last_name from customer where c_last_name rlike '^Ma?c[Dd]onald$';
+
+-- Match multiple character sequences, either 'Mac' or 'Mc'.
+select c_first_name, c_last_name from customer where c_last_name regexp '^(Mac|Mc)donald$';
+select c_first_name, c_last_name from customer where c_last_name rlike '^(Mac|Mc)donald$';
+
+-- Find names starting with 'S', then one or more vowels, then 'r', then any other characters.
+-- Matches 'Searcy', 'Sorenson', 'Sauer'.
+select c_first_name, c_last_name from customer where c_last_name regexp '^S[aeiou]+r.*$';
+select c_first_name, c_last_name from customer where c_last_name rlike '^S[aeiou]+r.*$';
+
+-- Find names that end with 2 or more vowels: letters from the set a,e,i,o,u.
+select c_first_name, c_last_name from customer where c_last_name regexp '.*[aeiou]{2,}$';
+select c_first_name, c_last_name from customer where c_last_name rlike '.*[aeiou]{2,}$';
+
+-- You can use letter ranges in the [] blocks, for example to find names starting with A, B, or C.
+select c_first_name, c_last_name from customer where c_last_name regexp '^[A-C].*';
+select c_first_name, c_last_name from customer where c_last_name rlike '^[A-C].*';
+
+-- If you are not sure about case, leading/trailing spaces, and so on, you can process the
+-- column using string functions first.
+select c_first_name, c_last_name from customer where lower(trim(c_last_name)) regexp '^de.*';
+select c_first_name, c_last_name from customer where lower(trim(c_last_name)) rlike '^de.*';
+</codeblock>
+
+ <p id="show_security">
+ When authorization is enabled, the output of the <codeph>SHOW</codeph> statement is limited to those
+ objects for which you have some privilege. There might be other databases, tables, and so on, but their
+ names are concealed. If you believe an object exists but you cannot see it in the <codeph>SHOW</codeph>
+ output, check with the system administrator if you need to be granted a new privilege for that object. See
+ <xref href="../topics/impala_authorization.xml#authorization"/> for how to set up authorization and add
+ privileges for specific kinds of objects.
+ </p>
+
+ <p rev="2.0.0" id="user_kerberized">
+ In Impala 2.0 and later, <codeph>user()</codeph> returns the full Kerberos principal string, such as
+ <codeph>user@example.com</codeph>, in a Kerberized environment.
+ </p>
+
+ <ul>
+ <li id="grant_revoke_single">
+ Currently, each Impala <codeph>GRANT</codeph> or <codeph>REVOKE</codeph> statement can only grant or
+ revoke a single privilege to or from a single role.
+ </li>
+ </ul>
+
+ <p id="blobs_are_strings">
+ All data in <codeph>CHAR</codeph> and <codeph>VARCHAR</codeph> columns must be in a character encoding that
+ is compatible with UTF-8. If you have binary data from another database system (that is, a BLOB type), use
+ a <codeph>STRING</codeph> column to hold it.
+ </p>
+
+<!-- The codeblock is nested inside this paragraph, so the intro text
+ and the code get conref'ed as a unit. -->
+
+ <p id="create_drop_view_examples">
+ The following example creates a series of views and then drops them. These examples illustrate how views
+ are associated with a particular database, and both the view definitions and the view names for
+ <codeph>CREATE VIEW</codeph> and <codeph>DROP VIEW</codeph> can refer to a view in the current database or
+ a fully qualified view name.
+<codeblock xml:space="preserve">
+-- Create and drop a view in the current database.
+CREATE VIEW few_rows_from_t1 AS SELECT * FROM t1 LIMIT 10;
+DROP VIEW few_rows_from_t1;
+
+-- Create and drop a view referencing a table in a different database.
+CREATE VIEW table_from_other_db AS SELECT x FROM db1.foo WHERE x IS NOT NULL;
+DROP VIEW table_from_other_db;
+
+USE db1;
+-- Create a view in a different database.
+CREATE VIEW db2.v1 AS SELECT * FROM db2.foo;
+-- Switch into the other database and drop the view.
+USE db2;
+DROP VIEW v1;
+
+USE db1;
+-- Create a view in a different database.
+CREATE VIEW db2.v1 AS SELECT * FROM db2.foo;
+-- Drop a view in the other database.
+DROP VIEW db2.v1;
+</codeblock>
+ </p>
+
+ <p id="char_varchar_cast_from_string">
+ For <codeph>INSERT</codeph> operations into <codeph>CHAR</codeph> or <codeph>VARCHAR</codeph> columns, you
+ must cast all <codeph>STRING</codeph> literals or expressions returning <codeph>STRING</codeph> to a
+ <codeph>CHAR</codeph> or <codeph>VARCHAR</codeph> type with the appropriate length.
+ </p>
+
+ <p rev="2.0.0" id="subquery_no_limit">
+ Correlated subqueries used in <codeph>EXISTS</codeph> and <codeph>IN</codeph> operators cannot include a
+ <codeph>LIMIT</codeph> clause.
+ </p>
+
+ <p id="avro_no_timestamp">
+ Currently, Avro tables cannot contain <codeph>TIMESTAMP</codeph> columns. If you need to store date and
+ time values in Avro tables, as a workaround you can use a <codeph>STRING</codeph> representation of the
+ values, convert the values to <codeph>BIGINT</codeph> with the <codeph>UNIX_TIMESTAMP()</codeph> function,
+ or create separate numeric columns for individual date and time fields using the <codeph>EXTRACT()</codeph>
+ function.
+ </p>
+
+ <p id="zero_length_strings">
+ <b>Zero-length strings:</b> For purposes of clauses such as <codeph>DISTINCT</codeph> and <codeph>GROUP
+ BY</codeph>, Impala considers zero-length strings (<codeph>""</codeph>), <codeph>NULL</codeph>, and space
+ to all be different values.
+ </p>
+
+ <p id="order_by_scratch_dir">
+ By default, intermediate files used during large sort, join, aggregation, or analytic function operations
+ are stored in the directory <filepath>/tmp/impala-scratch</filepath> . These files are removed when the
+ operation finishes. (Multiple concurrent queries can perform operations that use the <q>spill to disk</q>
+ technique, without any name conflicts for these temporary files.) You can specify a different location by
+ starting the <cmdname>impalad</cmdname> daemon with the
+ <codeph>--scratch_dirs="<varname>path_to_directory</varname>"</codeph> configuration option or the
+ equivalent configuration option in the Cloudera Manager user interface. You can specify a single directory,
+ or a comma-separated list of directories. The scratch directories must be on the local filesystem, not in
+ HDFS. You might specify different directory paths for different hosts, depending on the capacity and speed
+ of the available storage devices. In CDH 5.5 / Impala 2.3 or higher, Impala successfully starts (with a warning
+ written to the log) if it cannot create or read and write files in one of the scratch directories.
+ If there is less than 1 GB free on the filesystem where that directory resides, Impala still runs, but writes a
+ warning message to its log. If Impala encounters an error reading or writing files in a scratch directory during
+ a query, Impala logs the error and the query fails.
+ </p>
+
+ <p id="order_by_view_restriction">
+ An <codeph>ORDER BY</codeph> clause without an additional <codeph>LIMIT</codeph> clause is ignored in any
+ view definition. If you need to sort the entire result set from a view, use an <codeph>ORDER BY</codeph>
+ clause in the <codeph>SELECT</codeph> statement that queries the view. You can still make a simple <q>top
+ 10</q> report by combining the <codeph>ORDER BY</codeph> and <codeph>LIMIT</codeph> clauses in the same
+ view definition:
+<codeblock xml:space="preserve">[localhost:21000] > create table unsorted (x bigint);
+[localhost:21000] > insert into unsorted values (1), (9), (3), (7), (5), (8), (4), (6), (2);
+[localhost:21000] > create view sorted_view as select x from unsorted order by x;
+[localhost:21000] > select x from sorted_view; -- ORDER BY clause in view has no effect.
++---+
+| x |
++---+
+| 1 |
+| 9 |
+| 3 |
+| 7 |
+| 5 |
+| 8 |
+| 4 |
+| 6 |
+| 2 |
++---+
+[localhost:21000] > select x from sorted_view order by x; -- View query requires ORDER BY at outermost level.
++---+
+| x |
++---+
+| 1 |
+| 2 |
+| 3 |
+| 4 |
+| 5 |
+| 6 |
+| 7 |
+| 8 |
+| 9 |
++---+
+[localhost:21000] > create view top_3_view as select x from unsorted order by x limit 3;
+[localhost:21000] > select x from top_3_view; -- ORDER BY and LIMIT together in view definition are preserved.
++---+
+| x |
++---+
+| 1 |
+| 2 |
+| 3 |
++---+
+</codeblock>
+ </p>
+
+ <p id="precision_scale_example">
+ The following examples demonstrate how to check the precision and scale of numeric literals or other
+ numeric expressions. Impala represents numeric literals in the smallest appropriate type. 5 is a
+ <codeph>TINYINT</codeph> value, which ranges from -128 to 127, therefore 3 decimal digits are needed to
+ represent the entire range, and because it is an integer value there are no fractional digits. 1.333 is
+ interpreted as a <codeph>DECIMAL</codeph> value, with 4 digits total and 3 digits after the decimal point.
+<codeblock xml:space="preserve">[localhost:21000] > select precision(5), scale(5);
++--------------+----------+
+| precision(5) | scale(5) |
++--------------+----------+
+| 3 | 0 |
++--------------+----------+
+[localhost:21000] > select precision(1.333), scale(1.333);
++------------------+--------------+
+| precision(1.333) | scale(1.333) |
++------------------+--------------+
+| 4 | 3 |
++------------------+--------------+
+[localhost:21000] > with t1 as
+ ( select cast(12.34 as decimal(20,2)) x union select cast(1 as decimal(8,6)) x )
+ select precision(x), scale(x) from t1 limit 1;
++--------------+----------+
+| precision(x) | scale(x) |
++--------------+----------+
+| 24 | 6 |
++--------------+----------+
+</codeblock>
+ </p>
+
+<!-- These 'type_' entries are for query options, where the type doesn't match up exactly with an Impala data type. -->
+
+ <p id="type_boolean">
+ <b>Type:</b> Boolean; recognized values are 1 and 0, or <codeph>true</codeph> and <codeph>false</codeph>;
+ any other value interpreted as <codeph>false</codeph>
+ </p>
+
+ <p id="type_string">
+ <b>Type:</b> string
+ </p>
+
+ <p id="default_false">
+ <b>Default:</b> <codeph>false</codeph>
+ </p>
+
+ <p id="default_false_0">
+ <b>Default:</b> <codeph>false</codeph> (shown as 0 in output of <codeph>SET</codeph> statement)
+ </p>
+
+ <p id="odd_return_type_string">
+ Currently, the return value is always a <codeph>STRING</codeph>. The return type is subject to change in
+ future releases. Always use <codeph>CAST()</codeph> to convert the result to whichever data type is
+ appropriate for your computations.
+ </p>
+
+ <p rev="2.0.0" id="former_odd_return_type_string">
+ <b>Return type:</b> <codeph>DOUBLE</codeph> in Impala 2.0 and higher; <codeph>STRING</codeph> in earlier
+ releases
+ </p>
+
+ <p id="for_compatibility_only">
+ <b>Usage notes:</b> Primarily for compatibility with code containing industry extensions to SQL.
+ </p>
+
+ <p id="return_type_boolean">
+ <b>Return type:</b> <codeph>BOOLEAN</codeph>
+ </p>
+
+ <p id="return_type_double">
+ <b>Return type:</b> <codeph>DOUBLE</codeph>
+ </p>
+
+ <p id="return_type_same">
+ <b>Return type:</b> Same as the input value
+ </p>
+
+ <p id="return_type_same_except_string">
+ <b>Return type:</b> Same as the input value, except for <codeph>CHAR</codeph> and <codeph>VARCHAR</codeph>
+ arguments which produce a <codeph>STRING</codeph> result
+ </p>
+
+ <p id="builtins_db">
+ Impala includes another predefined database, <codeph>_impala_builtins</codeph>, that serves as the location
+ for the <xref href="../topics/impala_functions.xml#builtins">built-in functions</xref>. To see the built-in
+ functions, use a statement like the following:
+<codeblock xml:space="preserve">show functions in _impala_builtins;
+show functions in _impala_builtins like '*<varname>substring</varname>*';
+</codeblock>
+ </p>
+
+ <p id="sum_double">
+ Due to the way arithmetic on <codeph>FLOAT</codeph> and <codeph>DOUBLE</codeph> columns uses
+ high-performance hardware instructions, and distributed queries can perform these operations in different
+ order for each query, results can vary slightly for aggregate function calls such as <codeph>SUM()</codeph>
+ and <codeph>AVG()</codeph> for <codeph>FLOAT</codeph> and <codeph>DOUBLE</codeph> columns, particularly on
+ large data sets where millions or billions of values are summed or averaged. For perfect consistency and
+ repeatability, use the <codeph>DECIMAL</codeph> data type for such operations instead of
+ <codeph>FLOAT</codeph> or <codeph>DOUBLE</codeph>.
+ </p>
+
+ <p id="float_double_decimal_caveat">
+ The inability to exactly represent certain floating-point values means that
+ <codeph>DECIMAL</codeph> is sometimes a better choice than <codeph>DOUBLE</codeph>
+ or <codeph>FLOAT</codeph> when precision is critical, particularly when
+ transferring data from other database systems that use different representations
+ or file formats.
+ </p>
+
+ <p rev="1.4.0" id="decimal_no_stats">
+ Currently, the <codeph>COMPUTE STATS</codeph> statement under CDH 4 does not store any statistics for
+ <codeph>DECIMAL</codeph> columns. When Impala runs under CDH 5, which has better support for
+ <codeph>DECIMAL</codeph> in the metastore database, <codeph>COMPUTE STATS</codeph> does collect statistics
+ for <codeph>DECIMAL</codeph> columns and Impala uses the statistics to optimize query performance.
+ </p>
+
+ <p id="datetime_function_chaining">
+ <codeph>unix_timestamp()</codeph> and <codeph>from_unixtime()</codeph> are often used in combination to
+ convert a <codeph>TIMESTAMP</codeph> value into a particular string format. For example:
+<codeblock xml:space="preserve">select from_unixtime(unix_timestamp(now() + interval 3 days), 'yyyy/MM/dd HH:mm');
+</codeblock>
+ </p>
+
+ <p rev="1.4.0 obwl" id="insert_sort_blurb">
+ <b>Sorting considerations:</b> Although you can specify an <codeph>ORDER BY</codeph> clause in an
+ <codeph>INSERT ... SELECT</codeph> statement, any <codeph>ORDER BY</codeph> clause is ignored and the
+ results are not necessarily sorted. An <codeph>INSERT ... SELECT</codeph> operation potentially creates
+ many different data files, prepared on different data nodes, and therefore the notion of the data being
+ stored in sorted order is impractical.
+ </p>
+
+ <p rev="1.4.0" id="create_table_like_view">
+ Prior to Impala 1.4.0, it was not possible to use the <codeph>CREATE TABLE LIKE
+ <varname>view_name</varname></codeph> syntax. In Impala 1.4.0 and higher, you can create a table with the
+ same column definitions as a view using the <codeph>CREATE TABLE LIKE</codeph> technique. Although
+ <codeph>CREATE TABLE LIKE</codeph> normally inherits the file format of the original table, a view has no
+ underlying file format, so <codeph>CREATE TABLE LIKE <varname>view_name</varname></codeph> produces a text
+ table by default. To specify a different file format, include a <codeph>STORED AS
+ <varname>file_format</varname></codeph> clause at the end of the <codeph>CREATE TABLE LIKE</codeph>
+ statement.
+ </p>
+
+ <note rev="1.4.0" id="compute_stats_nulls">
+ Prior to Impala 1.4.0, <codeph>COMPUTE STATS</codeph> counted the number of <codeph>NULL</codeph> values in
+ each column and recorded that figure in the metastore database. Because Impala does not currently make use
+ of the <codeph>NULL</codeph> count during query planning, Impala 1.4.0 and higher speeds up the
+ <codeph>COMPUTE STATS</codeph> statement by skipping this <codeph>NULL</codeph> counting.
+ </note>
+
+ <p rev="1.3.1" id="regexp_matching">
+ In Impala 1.3.1 and higher, the <codeph>REGEXP</codeph> and <codeph>RLIKE</codeph> operators now match a
+ regular expression string that occurs anywhere inside the target string, the same as if the regular
+ expression was enclosed on each side by <codeph>.*</codeph>. See
+ <xref href="../topics/impala_operators.xml#regexp"/> for examples. Previously, these operators only
+ succeeded when the regular expression matched the entire target string. This change improves compatibility
+ with the regular expression support for popular database systems. There is no change to the behavior of the
+ <codeph>regexp_extract()</codeph> and <codeph>regexp_replace()</codeph> built-in functions.
+ </p>
+
+ <p rev="1.3.1" id="insert_inherit_permissions">
+ By default, if an <codeph>INSERT</codeph> statement creates any new subdirectories underneath a partitioned
+ table, those subdirectories are assigned default HDFS permissions for the <codeph>impala</codeph> user. To
+ make each subdirectory have the same permissions as its parent directory in HDFS, specify the
+ <codeph>--insert_inherit_permissions</codeph> startup option for the <cmdname>impalad</cmdname> daemon.
+ </p>
+
+ <note id="multiple_count_distinct">
+ <p>
+ By default, Impala only allows a single <codeph>COUNT(DISTINCT <varname>columns</varname>)</codeph>
+ expression in each query.
+ </p>
+ <p>
+ If you do not need precise accuracy, you can produce an estimate of the distinct values for a column by
+ specifying <codeph>NDV(<varname>column</varname>)</codeph>; a query can contain multiple instances of
+ <codeph>NDV(<varname>column</varname>)</codeph>. To make Impala automatically rewrite
+ <codeph>COUNT(DISTINCT)</codeph> expressions to <codeph>NDV()</codeph>, enable the
+ <codeph>APPX_COUNT_DISTINCT</codeph> query option.
+ </p>
+ <p>
+ To produce the same result as multiple <codeph>COUNT(DISTINCT)</codeph> expressions, you can use the
+ following technique for queries involving a single table:
+ </p>
+<codeblock xml:space="preserve">select v1.c1 result1, v2.c1 result2 from
+ (select count(distinct col1) as c1 from t1) v1
+ cross join
+ (select count(distinct col2) as c1 from t1) v2;
+</codeblock>
+ <p>
+ Because <codeph>CROSS JOIN</codeph> is an expensive operation, prefer to use the <codeph>NDV()</codeph>
+ technique wherever practical.
+ </p>
+ </note>
+
+ <p>
+ <ph id="union_all_vs_union">Prefer <codeph>UNION ALL</codeph> over <codeph>UNION</codeph> when you know the
+ data sets are disjoint or duplicate values are not a problem; <codeph>UNION ALL</codeph> is more efficient
+ because it avoids materializing and sorting the entire result set to eliminate duplicate values.</ph>
+ </p>
+
+ <note id="thorn">
+ The <codeph>CREATE TABLE</codeph> clauses <codeph>FIELDS TERMINATED BY</codeph>, <codeph>ESCAPED
+ BY</codeph>, and <codeph>LINES TERMINATED BY</codeph> have special rules for the string literal used for
+ their argument, because they all require a single character. You can use a regular character surrounded by
+ single or double quotation marks, an octal sequence such as <codeph>'\054'</codeph> (representing a comma),
+ or an integer in the range '-127'..'128' (with quotation marks but no backslash), which is interpreted as a
+ single-byte ASCII character. Negative values are subtracted from 256; for example, <codeph>FIELDS
+ TERMINATED BY '-2'</codeph> sets the field delimiter to ASCII code 254, the <q>Icelandic Thorn</q>
+ character used as a delimiter by some data formats.
+ </note>
+
+ <p id="command_line_blurb">
+ <b>Command-line equivalent:</b>
+ </p>
+
+ <p rev="2.3.0" id="complex_types_blurb">
+ <b>Complex type considerations:</b>
+ </p>
+
+ <p id="complex_types_combo">
+ Because complex types are often used in combination,
+ for example an <codeph>ARRAY</codeph> of <codeph>STRUCT</codeph>
+ elements, if you are unfamiliar with the Impala complex types,
+ start with <xref href="../topics/impala_complex_types.xml#complex_types"/> for
+ background information and usage examples.
+ </p>
+
+ <ul id="complex_types_restrictions">
+ <li>
+ Columns with this data type can only be used in tables or partitions with the Parquet file format.
+ </li>
+ <li>
+ Columns with this data type cannot be used as partition key columns in a partitioned table.
+ </li>
+ <li>
+ The <codeph>COMPUTE STATS</codeph> statement does not produce any statistics for columns of this data type.
+ </li>
+ <li>
+ See <xref href="../topics/impala_complex_types.xml#complex_types_limits"/> for a full list of limitations
+ and associated guidelines about complex type columns.
+ </li>
+ </ul>
+
+ <p rev="2.3.0" id="complex_types_partitioning">
+ Partitioned tables can contain complex type columns.
+ All the partition key columns must be scalar types.
+ </p>
+
+ <p rev="2.3.0" id="complex_types_describe">
+ You can pass a multi-part qualified name to <codeph>DESCRIBE</codeph>
+ to specify an <codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, or <codeph>MAP</codeph>
+ column and visualize its structure as if it were a table.
+ For example, if table <codeph>T1</codeph> contains an <codeph>ARRAY</codeph> column
+ <codeph>A1</codeph>, you could issue the statement <codeph>DESCRIBE t1.a1</codeph>.
+ If table <codeph>T1</codeph> contained a <codeph>STRUCT</codeph> column <codeph>S1</codeph>,
+ and a field <codeph>F1</codeph> within the <codeph>STRUCT</codeph> was a <codeph>MAP</codeph>,
+ you could issue the statement <codeph>DESCRIBE t1.s1.f1</codeph>.
+ An <codeph>ARRAY</codeph> is shown as a two-column table, with
+ <codeph>ITEM</codeph> and <codeph>POS</codeph> columns.
+ A <codeph>STRUCT</codeph> is shown as a table with each field
+ representing a column in the table.
+ A <codeph>MAP</codeph> is shown as a two-column table, with
+ <codeph>KEY</codeph> and <codeph>VALUE</codeph> columns.
+ </p>
+
+ <note id="complex_type_schema_pointer">
+ Many of the complex type examples refer to tables
+ such as <codeph>CUSTOMER</codeph> and <codeph>REGION</codeph>
+ adapted from the tables used in the TPC-H benchmark.
+ See <xref href="../topics/impala_complex_types.xml#complex_sample_schema"/>
+ for the table definitions.
+ </note>
+
+ <p rev="2.3.0" id="complex_types_unsupported_filetype">
+ <b>Complex type considerations:</b>
+ Although you can create tables in this file format using
+ the complex types (<codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>,
+ and <codeph>MAP</codeph>) available in CDH 5.5 / Impala 2.3 and higher,
+ currently, Impala can query these types only in Parquet tables.
+ </p>
+
+ <p rev="2.3.0" id="complex_types_caveat_no_operator">
+ You cannot refer to a column with a complex data type (<codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, or <codeph>MAP</codeph>)
+ directly in an operator. You can apply operators only to scalar values that make up a complex type
+ (the fields of a <codeph>STRUCT</codeph>, the items of an <codeph>ARRAY</codeph>,
+ or the key or value portion of a <codeph>MAP</codeph>) as part of a join query that refers to
+ the scalar value using the appropriate dot notation or <codeph>ITEM</codeph>, <codeph>KEY</codeph>, or <codeph>VALUE</codeph>
+ pseudocolumn names.
+ </p>
+
+ <p rev="2.3.0" id="udfs_no_complex_types">
+ Currently, Impala UDFs cannot accept arguments or return values of the Impala complex types
+ (<codeph>STRUCT</codeph>, <codeph>ARRAY</codeph>, or <codeph>MAP</codeph>).
+ </p>
+
+ <p rev="2.3.0" id="complex_types_read_only">
+ Impala currently cannot write new data files containing complex type columns.
+ Therefore, although the <codeph>SELECT</codeph> statement works for queries
+ involving complex type columns, you cannot use a statement form that writes
+ data to complex type columns, such as <codeph>CREATE TABLE AS SELECT</codeph> or <codeph>INSERT ... SELECT</codeph>.
+ To create data files containing complex type data, use the Hive <codeph>INSERT</codeph> statement, or another
+ ETL mechanism such as MapReduce jobs, Spark jobs, Pig, and so on.
+ </p>
+
+ <p rev="2.3.0" id="complex_types_views">
+ For tables containing complex type columns (<codeph>ARRAY</codeph>,
+ <codeph>STRUCT</codeph>, or <codeph>MAP</codeph>), you typically use
+ join queries to refer to the complex values. You can use views to
+ hide the join notation, making such tables seem like traditional denormalized
+ tables, and making those tables queryable by business intelligence tools
+ that do not have built-in support for those complex types.
+ See <xref href="../topics/impala_complex_types.xml#complex_types_views"/> for details.
+ </p>
+
+ <p rev="2.3.0" id="complex_types_views_caveat">
+ Because you cannot directly issue <codeph>SELECT <varname>col_name</varname></codeph>
+ against a column of complex type, you cannot use a view or a <codeph>WITH</codeph>
+ clause to <q>rename</q> a column by selecting it with a column alias.
+ </p>
+
+ <p rev="2.3.0" id="jdbc_odbc_complex_types">
+ The Impala complex types (<codeph>STRUCT</codeph>, <codeph>ARRAY</codeph>, or <codeph>MAP</codeph>)
+ are available in CDH 5.5 / Impala 2.3 and higher.
+ To use these types with JDBC requires version 2.5.28 or higher of the Cloudera JDBC Connector for Impala.
+ To use these types with ODBC requires version 2.5.30 or higher of the Cloudera ODBC Connector for Impala.
+ Consider upgrading all JDBC and ODBC drivers at the same time you upgrade to CDH 5.5 or higher.
+ </p>
+
+ <p rev="2.3.0" id="jdbc_odbc_complex_types_views">
+ Although the result sets from queries involving complex types consist of all scalar values,
+ the queries involve join notation and column references that might not be understood by
+ a particular JDBC or ODBC connector. Consider defining a view that represents the
+ flattened version of a table containing complex type columns, and pointing the JDBC
+ or ODBC application at the view.
+ See <xref href="../topics/impala_complex_types.xml#complex_types"/> for details.
+ </p>
+
+ <p rev="2.3.0" id="complex_types_aggregation_explanation">
+ To access a column with a complex type (<codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, or <codeph>MAP</codeph>)
+ in an aggregation function, you unpack the individual elements using join notation in the query,
+ and then apply the function to the final scalar item, field, key, or value at the bottom of any nested type hierarchy in the column.
+ See <xref href="../topics/impala_complex_types.xml#complex_types"/> for details about using complex types in Impala.
+ </p>
+
+<p rev="2.3.0" id="complex_types_aggregation_example">
+The following example demonstrates calls to several aggregation functions
+using values from a column containing nested complex types
+(an <codeph>ARRAY</codeph> of <codeph>STRUCT</codeph> items).
+The array is unpacked inside the query using join notation.
+The array elements are referenced using the <codeph>ITEM</codeph>
+pseudocolumn, and the structure fields inside the array elements
+are referenced using dot notation.
+Numeric values such as <codeph>SUM()</codeph> and <codeph>AVG()</codeph>
+are computed using the numeric <codeph>R_NATIONKEY</codeph> field, and
+the general-purpose <codeph>MAX()</codeph> and <codeph>MIN()</codeph>
+values are computed from the string <codeph>N_NAME</codeph> field.
+<codeblock>describe region;
++-------------+-------------------------+---------+
+| name | type | comment |
++-------------+-------------------------+---------+
+| r_regionkey | smallint | |
+| r_name | string | |
+| r_comment | string | |
+| r_nations | array&lt;struct&lt; | |
+| | n_nationkey:smallint, | |
+| | n_name:string, | |
+| | n_comment:string | |
+| | >> | |
++-------------+-------------------------+---------+
+
+select r_name, r_nations.item.n_nationkey
+ from region, region.r_nations as r_nations
+order by r_name, r_nations.item.n_nationkey;
++-------------+------------------+
+| r_name | item.n_nationkey |
++-------------+------------------+
+| AFRICA | 0 |
+| AFRICA | 5 |
+| AFRICA | 14 |
+| AFRICA | 15 |
+| AFRICA | 16 |
+| AMERICA | 1 |
+| AMERICA | 2 |
+| AMERICA | 3 |
+| AMERICA | 17 |
+| AMERICA | 24 |
+| ASIA | 8 |
+| ASIA | 9 |
+| ASIA | 12 |
+| ASIA | 18 |
+| ASIA | 21 |
+| EUROPE | 6 |
+| EUROPE | 7 |
+| EUROPE | 19 |
+| EUROPE | 22 |
+| EUROPE | 23 |
+| MIDDLE EAST | 4 |
+| MIDDLE EAST | 10 |
+| MIDDLE EAST | 11 |
+| MIDDLE EAST | 13 |
+| MIDDLE EAST | 20 |
++-------------+------------------+
+
+select
+ r_name,
+ count(r_nations.item.n_nationkey) as count,
+ sum(r_nations.item.n_nationkey) as sum,
+ avg(r_nations.item.n_nationkey) as average,
+ min(r_nations.item.n_name) as minimum,
+ max(r_nations.item.n_name) as maximum,
+ ndv(r_nations.item.n_nationkey) as distinct_values
+from
+ region, region.r_nations as r_nations
+group by r_name
+order by r_name;
++-------------+-------+-----+---------+-----------+----------------+-----------------+
+| r_name | count | sum | average | minimum | maximum | distinct_values |
++-------------+-------+-----+---------+-----------+----------------+-----------------+
+| AFRICA | 5 | 50 | 10 | ALGERIA | MOZAMBIQUE | 5 |
+| AMERICA | 5 | 47 | 9.4 | ARGENTINA | UNITED STATES | 5 |
+| ASIA | 5 | 68 | 13.6 | CHINA | VIETNAM | 5 |
+| EUROPE | 5 | 77 | 15.4 | FRANCE | UNITED KINGDOM | 5 |
+| MIDDLE EAST | 5 | 58 | 11.6 | EGYPT | SAUDI ARABIA | 5 |
++-------------+-------+-----+---------+-----------+----------------+-----------------+
+</codeblock>
+</p>
+
+ <p id="hive_blurb">
+ <b>Hive considerations:</b>
+ </p>
+
+ <p rev="CDH-19187" id="permissions_blurb">
+ <b>HDFS permissions:</b>
+ </p>
+
+ <p rev="CDH-19187" id="permissions_blurb_no">
+ <b>HDFS permissions:</b> This statement does not touch any HDFS files or directories,
+ therefore no HDFS permissions are required.
+ </p>
+
+ <p id="security_blurb">
+ <b>Security considerations:</b>
+ </p>
+
+ <p id="performance_blurb">
+ <b>Performance considerations:</b>
+ </p>
+
+ <p id="conversion_blurb">
+ <b>Casting and conversions:</b>
+ </p>
+
+ <p id="related_info">
+ <b>Related information:</b>
+ </p>
+
+ <p id="related_tasks">
+ <b>Related tasks:</b>
+ </p>
+
+ <p id="related_options">
+ <b>Related startup options:</b>
+ </p>
+
+ <p id="restrictions_blurb">
+ <b>Restrictions:</b>
+ </p>
+
+ <p rev="2.0.0" id="restrictions_sliding_window">
+ <b>Restrictions:</b> In Impala 2.0 and higher, this function can be used as an analytic function, but with restrictions on any window clause.
+ For <codeph>MAX()</codeph> and <codeph>MIN()</codeph>, the window clause is only allowed if the start
+ bound is <codeph>UNBOUNDED PRECEDING</codeph>.
+ </p>
+
+<!-- This blurb has been superseded by analytic_not_allowed_caveat. Consider removing it if it turns out never to be needed. -->
+ <p rev="2.0.0" id="restrictions_non_analytic">
+ <b>Restrictions:</b> This function cannot be used as an analytic function; it does not currently support
+ the <codeph>OVER()</codeph> clause.
+ </p>
+
+ <p id="compatibility_blurb">
+ <b>Compatibility:</b>
+ </p>
+
+ <p id="null_blurb">
+ <b>NULL considerations:</b>
+ </p>
+
+ <p id="udf_blurb">
+ <b>UDF considerations:</b>
+ </p>
+
+ <p id="udf_blurb_no">
+ <b>UDF considerations:</b> This type cannot be used for the argument or return type of a user-defined
+ function (UDF) or user-defined aggregate function (UDA).
+ </p>
+
+ <p id="view_blurb">
+ <b>Considerations for views:</b>
+ </p>
+
+ <p id="null_bad_numeric_cast">
+ <b>NULL considerations:</b> Casting any non-numeric value to this type produces a <codeph>NULL</codeph>
+ value.
+ </p>
+
+ <p id="null_bad_timestamp_cast">
+ <b>NULL considerations:</b> Casting any unrecognized <codeph>STRING</codeph> value to this type produces a
+ <codeph>NULL</codeph> value.
+ </p>
+
+ <p id="null_null_arguments">
+ <b>NULL considerations:</b> An expression of this type produces a <codeph>NULL</codeph> value if any
+ argument of the expression is <codeph>NULL</codeph>.
+ </p>
+
+ <p id="privileges_blurb">
+ <b>Required privileges:</b>
+ </p>
+
+ <p id="parquet_blurb">
+ <b>Parquet considerations:</b>
+ </p>
+
+ <p id="parquet_tools_blurb">
+ To examine the internal structure and data of Parquet files, you can use the
+ <cmdname>parquet-tools</cmdname> command that comes with CDH. Make sure this
+ command is in your <codeph>$PATH</codeph>. (Typically, it is symlinked from
+ <filepath>/usr/bin</filepath>; sometimes, depending on your installation setup, you
+ might need to locate it under a CDH-specific <codeph>bin</codeph> directory.)
+ The arguments to this command let you perform operations such as:
+ <ul>
+ <li>
+ <codeph>cat</codeph>: Print a file's contents to standard output. In CDH 5.5 and higher, you can use
+ the <codeph>-j</codeph> option to output JSON.
+ </li>
+ <li>
+ <codeph>head</codeph>: Print the first few records of a file to standard output.
+ </li>
+ <li>
+ <codeph>schema</codeph>: Print the Parquet schema for the file.
+ </li>
+ <li>
+ <codeph>meta</codeph>: Print the file footer metadata, including key-value properties (like Avro schema), compression ratios,
+ encodings, compression used, and row group information.
+ </li>
+ <li>
+ <codeph>dump</codeph>: Print all data and metadata.
+ </li>
+ </ul>
+ Use <codeph>parquet-tools -h</codeph> to see usage information for all the arguments.
+ Here are some examples showing <cmdname>parquet-tools</cmdname> usage:
+
+<codeblock><![CDATA[
+$ # Be careful doing this for a big file! Use parquet-tools head to be safe.
+$ parquet-tools cat sample.parq
+year = 1992
+month = 1
+day = 2
+dayofweek = 4
+dep_time = 748
+crs_dep_time = 750
+arr_time = 851
+crs_arr_time = 846
+carrier = US
+flight_num = 53
+actual_elapsed_time = 63
+crs_elapsed_time = 56
+arrdelay = 5
+depdelay = -2
+origin = CMH
+dest = IND
+distince = 182
+cancelled = 0
+diverted = 0
+
+year = 1992
+month = 1
+day = 3
+...
+]]>
+</codeblock>
+
+<codeblock><![CDATA[
+$ parquet-tools head -n 2 sample.parq
+year = 1992
+month = 1
+day = 2
+dayofweek = 4
+dep_time = 748
+crs_dep_time = 750
+arr_time = 851
+crs_arr_time = 846
+carrier = US
+flight_num = 53
+actual_elapsed_time = 63
+crs_elapsed_time = 56
+arrdelay = 5
+depdelay = -2
+origin = CMH
+dest = IND
+distince = 182
+cancelled = 0
+diverted = 0
+
+year = 1992
+month = 1
+day = 3
+...
+]]>
+</codeblock>
+
+<codeblock><![CDATA[
+$ parquet-tools schema sample.parq
+message schema {
+ optional int32 year;
+ optional int32 month;
+ optional int32 day;
+ optional int32 dayofweek;
+ optional int32 dep_time;
+ optional int32 crs_dep_time;
+ optional int32 arr_time;
+ optional int32 crs_arr_time;
+ optional binary carrier;
+ optional int32 flight_num;
+...
+]]>
+</codeblock>
+
+<codeblock><![CDATA[
+$ parquet-tools meta sample.parq
+creator: impala version 2.2.0-cdh5.4.3 (build 517bb0f71cd604a00369254ac6d88394df83e0f6)
+
+file schema: schema
+-------------------------------------------------------------------
+year: OPTIONAL INT32 R:0 D:1
+month: OPTIONAL INT32 R:0 D:1
+day: OPTIONAL INT32 R:0 D:1
+dayofweek: OPTIONAL INT32 R:0 D:1
+dep_time: OPTIONAL INT32 R:0 D:1
+crs_dep_time: OPTIONAL INT32 R:0 D:1
+arr_time: OPTIONAL INT32 R:0 D:1
+crs_arr_time: OPTIONAL INT32 R:0 D:1
+carrier: OPTIONAL BINARY R:0 D:1
+flight_num: OPTIONAL INT32 R:0 D:1
+...
+
+row group 1: RC:20636601 TS:265103674
+-------------------------------------------------------------------
+year: INT32 SNAPPY DO:4 FPO:35 SZ:10103/49723/4.92 VC:20636601 ENC:PLAIN_DICTIONARY,RLE,PLAIN
+month: INT32 SNAPPY DO:10147 FPO:10210 SZ:11380/35732/3.14 VC:20636601 ENC:PLAIN_DICTIONARY,RLE,PLAIN
+day: INT32 SNAPPY DO:21572 FPO:21714 SZ:3071658/9868452/3.21 VC:20636601 ENC:PLAIN_DICTIONARY,RLE,PLAIN
+dayofweek: INT32 SNAPPY DO:3093276 FPO:3093319 SZ:2274375/5941876/2.61 VC:20636601 ENC:PLAIN_DICTIONARY,RLE,PLAIN
+dep_time: INT32 SNAPPY DO:5367705 FPO:5373967 SZ:28281281/28573175/1.01 VC:20636601 ENC:PLAIN_DICTIONARY,RLE,PLAIN
+crs_dep_time: INT32 SNAPPY DO:33649039 FPO:33654262 SZ:10220839/11574964/1.13 VC:20636601 ENC:PLAIN_DICTIONARY,RLE,PLAIN
+arr_time: INT32 SNAPPY DO:43869935 FPO:43876489 SZ:28562410/28797767/1.01 VC:20636601 ENC:PLAIN_DICTIONARY,RLE,PLAIN
+crs_arr_time: INT32 SNAPPY DO:72432398 FPO:72438151 SZ:10908972/12164626/1.12 VC:20636601 ENC:PLAIN_DICTIONARY,RLE,PLAIN
+carrier: BINARY SNAPPY DO:83341427 FPO:83341558 SZ:114916/128611/1.12 VC:20636601 ENC:PLAIN_DICTIONARY,RLE,PLAIN
+flight_num: INT32 SNAPPY DO:83456393 FPO:83488603 SZ:10216514/11474301/1.12 VC:20636601 ENC:PLAIN_DICTIONARY,RLE,PLAIN
+...
+]]>
+</codeblock>
+ </p>
+
+ <p id="parquet_ok">
+ <b>Parquet considerations:</b> This type is fully compatible with Parquet tables.
+ </p>
+
+ <p id="analytic_not_allowed_caveat">
+ This function cannot be used in an analytic context. That is, the <codeph>OVER()</codeph> clause is not allowed at all with this function.
+ </p>
+
+ <p id="impala_parquet_encodings_caveat">
+ Impala can query Parquet files that use the <codeph>PLAIN</codeph>, <codeph>PLAIN_DICTIONARY</codeph>,
+ <codeph>BIT_PACKED</codeph>, and <codeph>RLE</codeph> encodings.
+ Currently, Impala does not support <codeph>RLE_DICTIONARY</codeph> encoding.
+ When creating files outside of Impala for use by Impala, make sure to use one of the supported encodings.
+ In particular, for MapReduce jobs, <codeph>parquet.writer.version</codeph> must not be defined
+ (especially as <codeph>PARQUET_2_0</codeph>) in the configuration of Parquet MR jobs.
+ Use the default version (or format). The default format, 1.0, includes some enhancements that are compatible with older versions.
+ Data using the 2.0 format might not be consumable by Impala, due to use of the <codeph>RLE_DICTIONARY</codeph> encoding.
+ </p>
+
+ <note id="restrictions_nonimpala_parquet">
+ <p>
+ Currently, Impala always decodes the column data in Parquet files based on the ordinal position of the
+ columns, not by looking up the position of each column based on its name. Parquet files produced outside
+ of Impala must write column data in the same order as the columns are declared in the Impala table. Any
+ optional columns that are omitted from the data files must be the rightmost columns in the Impala table
+ definition.
+ </p>
+
+ <p>
+ If you created compressed Parquet files through some tool other than Impala, make sure that any
+ compression codecs are supported in Parquet by Impala. For example, Impala does not currently support LZO
+ compression in Parquet files. Also double-check that you used any recommended compatibility settings in
+ the other tool, such as <codeph>spark.sql.parquet.binaryAsString</codeph> when writing Parquet files
+ through Spark.
+ </p>
+ </note>
+
+ <p id="text_blurb">
+ <b>Text table considerations:</b>
+ </p>
+
+ <p id="text_bulky">
+ <b>Text table considerations:</b> Values of this type are potentially larger in text tables than in tables
+ using Parquet or other binary formats.
+ </p>
+
+ <p id="schema_evolution_blurb">
+ <b>Schema evolution considerations:</b>
+ </p>
+
+ <p id="column_stats_blurb">
+ <b>Column statistics considerations:</b>
+ </p>
+
+ <p id="column_stats_constant">
+ <b>Column statistics considerations:</b> Because this type has a fixed size, the maximum and average size
+ fields are always filled in for column statistics, even before you run the <codeph>COMPUTE STATS</codeph>
+ statement.
+ </p>
+
+ <p id="column_stats_variable">
+ <b>Column statistics considerations:</b> Because the values of this type have variable size, none of the
+ column statistics fields are filled in until you run the <codeph>COMPUTE STATS</codeph> statement.
+ </p>
+
+ <p id="usage_notes_blurb">
+ <b>Usage notes:</b>
+ </p>
+
+ <p id="example_blurb">
+ <b>Examples:</b>
+ </p>
+
+ <p id="result_set_blurb">
+ <b>Result set:</b>
+ </p>
+
+ <p id="jdbc_blurb">
+ <b>JDBC and ODBC considerations:</b>
+ </p>
+
+ <p id="cancel_blurb_no">
+ <b>Cancellation:</b> Cannot be cancelled.
+ </p>
+
+ <p id="cancel_blurb_yes">
+ <b>Cancellation:</b> Can be cancelled. To cancel this statement, use Ctrl-C from the
+ <cmdname>impala-shell</cmdname> interpreter, the <uicontrol>Cancel</uicontrol> button from the
+ <uicontrol>Watch</uicontrol> page in Hue, <uicontrol>Actions > Cancel</uicontrol> from the
+ <uicontrol>Queries</uicontrol> list in Cloudera Manager, or <uicontrol>Cancel</uicontrol> from the list of
+ in-flight queries (for a particular node) on the <uicontrol>Queries</uicontrol> tab in the Impala web UI
+ (port 25000).
+ </p>
+
+ <p id="cancel_blurb_maybe">
+ <b>Cancellation:</b> Certain multi-stage statements (<codeph>CREATE TABLE AS SELECT</codeph> and
+ <codeph>COMPUTE STATS</codeph>) can be cancelled during some stages, when running <codeph>INSERT</codeph>
+ or <codeph>SELECT</codeph> operations internally. To cancel this statement, use Ctrl-C from the
+ <cmdname>impala-shell</cmdname> interpreter, the <uicontrol>Cancel</uicontrol> button from the
+ <uicontrol>Watch</uicontrol> page in Hue, <uicontrol>Actions > Cancel</uicontrol> from the
+ <uicontrol>Queries</uicontrol> list in Cloudera Manager, or <uicontrol>Cancel</uicontrol> from the list of
+ in-flight queries (for a particular node) on the <uicontrol>Queries</uicontrol> tab in the Impala web UI
+ (port 25000).
+ </p>
+
+ <p id="partitioning_blurb">
+ <b>Partitioning:</b>
+ </p>
+
+ <p id="partitioning_good">
+ <b>Partitioning:</b> Prefer to use this type for a partition key column. Impala can process the numeric
+ type more efficiently than a <codeph>STRING</codeph> representation of the value.
+ </p>
+
+ <p id="partitioning_bad">
+ <b>Partitioning:</b> This type can be used for partition key columns. Because of the efficiency advantage
+ of numeric values over character-based values, if the partition key is a string representation of a number,
+ prefer to use an integer type with sufficient range (<codeph>INT</codeph>, <codeph>BIGINT</codeph>, and so
+ on) where practical.
+ </p>
+
+ <p id="partitioning_silly">
+ <b>Partitioning:</b> Because this type has so few distinct values, it is typically not a sensible choice
+ for a partition key column.
+ </p>
+
+ <p id="partitioning_imprecise">
+ <b>Partitioning:</b> Because fractional values of this type are not always represented precisely, when this
+ type is used for a partition key column, the underlying HDFS directories might not be named exactly as you
+ expect. Prefer to partition on a <codeph>DECIMAL</codeph> column instead.
+ </p>
+
+ <p id="partitioning_worrisome">
+ <b>Partitioning:</b> Because this type potentially has so many distinct values, it is often not a sensible
+ choice for a partition key column. For example, events 1 millisecond apart would be stored in different
+ partitions. Consider using the <codeph>TRUNC()</codeph> function to condense the number of distinct values,
+ and partition on a new column with the truncated values.
+ </p>
+
+ <p id="hdfs_blurb">
+ <b>HDFS considerations:</b>
+ </p>
+
+ <p id="file_format_blurb">
+ <b>File format considerations:</b>
+ </p>
+
+ <p id="s3_blurb" rev="2.2.0">
+ <b>Amazon S3 considerations:</b>
+ </p>
+
+ <p id="isilon_blurb" rev="5.4.3">
+ <b>Isilon considerations:</b>
+ </p>
+ <p id="isilon_block_size_caveat" rev="5.4.3">
+ Because the EMC Isilon storage devices use a global value for the block size
+ rather than a configurable value for each file, the <codeph>PARQUET_FILE_SIZE</codeph>
+ query option has no effect when Impala inserts data into a table or partition
+ residing on Isilon storage. Use the <codeph>isi</codeph> command to set the
+ default block size globally on the Isilon device. For example, to set the
+ Isilon default block size to 256 MB, the recommended size for Parquet
+ data files for Impala, issue the following command:
+<codeblock>isi hdfs settings modify --default-block-size=256MB</codeblock>
+ </p>
+
+ <p id="hbase_blurb">
+ <b>HBase considerations:</b>
+ </p>
+
+ <p id="hbase_ok">
+ <b>HBase considerations:</b> This data type is fully compatible with HBase tables.
+ </p>
+
+ <p id="hbase_no">
+ <b>HBase considerations:</b> This data type cannot be used with HBase tables.
+ </p>
+
+ <p id="internals_blurb">
+ <b>Internal details:</b>
+ </p>
+
+ <p id="internals_1_bytes">
+ <b>Internal details:</b> Represented in memory as a 1-byte value.
+ </p>
+
+ <p id="internals_2_bytes">
+ <b>Internal details:</b> Represented in memory as a 2-byte value.
+ </p>
+
+ <p id="internals_4_bytes">
+ <b>Internal details:</b> Represented in memory as a 4-byte value.
+ </p>
+
+ <p id="internals_8_bytes">
+ <b>Internal details:</b> Represented in memory as an 8-byte value.
+ </p>
+
+ <p id="internals_16_bytes">
+ <b>Internal details:</b> Represented in memory as a 16-byte value.
+ </p>
+
+ <p id="internals_max_bytes">
+ <b>Internal details:</b> Represented in memory as a byte array with the same size as the length
+ specification. Values that are shorter than the specified length are padded on the right with trailing
+ spaces.
+ </p>
+
+ <p id="internals_min_bytes">
+ <b>Internal details:</b> Represented in memory as a byte array with the minimum size needed to represent
+ each value.
+ </p>
+
+ <p rev="2.3.0" id="added_in_230">
+ <b>Added in:</b> CDH 5.5.0 (Impala 2.3.0)
+ </p>
+
+ <p rev="2.0.0" id="added_in_20">
+ <b>Added in:</b> CDH 5.2.0 (Impala 2.0.0)
+ </p>
+
+ <p rev="2.0.0" id="enhanced_in_20">
+ <b>Added in:</b> Available in earlier Impala releases, but new capabilities were added in
+ CDH 5.2.0 / Impala 2.0.0
+ </p>
+
+ <p id="added_forever">
+ <b>Added in:</b> Available in all versions of Impala.
+ </p>
+
+ <p id="added_in_140">
+ <b>Added in:</b> Impala 1.4.0
+ </p>
+
+ <p id="added_in_130">
+ <b>Added in:</b> Impala 1.3.0
+ </p>
+
+ <p id="added_in_11">
+ <b>Added in:</b> Impala 1.1
+ </p>
+
+ <p id="added_in_111">
+ <b>Added in:</b> Impala 1.1.1
+ </p>
+
+ <p id="added_in_210">
+ <b>Added in:</b> CDH 5.3.0 (Impala 2.1.0)
+ </p>
+
+ <p id="added_in_220">
+ <b>Added in:</b> CDH 5.4.0 (Impala 2.2.0)
+ </p>
+
+ <p id="syntax_blurb">
+ <b>Syntax:</b>
+ </p>
+
+ <p id="disk_space_blurb">
+ For other tips about managing and reclaiming Impala disk space, see
+ <xref href="../topics/impala_disk_space.xml#disk_space"/>.
+ </p>
+
+ <p id="join_types">
+ Impala supports a wide variety of <codeph>JOIN</codeph> clauses. Left, right, semi, full, and outer joins
+ are supported in all Impala versions. The <codeph>CROSS JOIN</codeph> operator is available in Impala 1.2.2
+ and higher. During performance tuning, you can override the reordering of join clauses that Impala does
+ internally by including the keyword <codeph>STRAIGHT_JOIN</codeph> immediately after the
+ <codeph>SELECT</codeph> keyword.
+ </p>
+
+ <p id="catalog_server_124">
+ In Impala 1.2.4 and higher, you can specify a table name with <codeph>INVALIDATE METADATA</codeph> after
+ the table is created in Hive, allowing you to make individual tables visible to Impala without doing a full
+ reload of the catalog metadata. Impala 1.2.4 also includes other changes to make the metadata broadcast
+ mechanism faster and more responsive, especially during Impala startup. See
+ <xref href="../topics/impala_new_features.xml#new_features_124"/> for details.
+ </p>
+
+ <p id="explain_interpret">
+ Read the <codeph>EXPLAIN</codeph> plan from bottom to top:
+ <ul>
+ <li>
+ The last part of the plan shows the low-level details such as the expected amount of data that will be
+ read, where you can judge the effectiveness of your partitioning strategy and estimate how long it will
+ take to scan a table based on total data size and the size of the cluster.
+ </li>
+
+ <li>
+ As you work your way up, next you see the operations that will be parallelized and performed on each
+ Impala node.
+ </li>
+
+ <li>
+ At the higher levels, you see how data flows when intermediate result sets are combined and transmitted
+ from one node to another.
+ </li>
+
+ <li>
+ See <xref href="../topics/impala_explain_level.xml#explain_level"/> for details about the
+ <codeph>EXPLAIN_LEVEL</codeph> query option, which lets you customize how much detail to show in the
+ <codeph>EXPLAIN</codeph> plan depending on whether you are doing high-level or low-level tuning,
+ dealing with logical or physical aspects of the query.
+ </li>
+ </ul>
+ </p>
+
+<!-- This sequence of paragraph + codeblock + paragraph is typically referenced in sequence wherever it's reused. -->
+
+ <p id="aggr1">
+ Aggregate functions are a special category with different rules. These functions calculate a return value
+ across all the items in a result set, so they require a <codeph>FROM</codeph> clause in the query:
+ </p>
+
+<codeblock id="aggr2" xml:space="preserve">select count(product_id) from product_catalog;
+select max(height), avg(height) from census_data where age > 20;
+</codeblock>
+
+ <p id="aggr3">
+ Aggregate functions also ignore <codeph>NULL</codeph> values rather than returning a <codeph>NULL</codeph>
+ result. For example, if some rows have <codeph>NULL</codeph> for a particular column, those rows are
+ ignored when computing the <codeph>AVG()</codeph> for that column. Likewise, specifying
+ <codeph>COUNT(<varname>col_name</varname>)</codeph> in a query counts only those rows where
+ <varname>col_name</varname> contains a non-<codeph>NULL</codeph> value.
+ </p>
+
+ <p>
+ <ph id="aliases_vs_identifiers"> Aliases follow the same rules as identifiers when it comes to case
+ insensitivity. Aliases can be longer than identifiers (up to the maximum length of a Java string) and can
+ include additional characters such as spaces and dashes when they are quoted using backtick characters.
+ </ph>
+ </p>
+
+ <p id="views_vs_identifiers">
+ Another way to define different names for the same tables or columns is to create views. See
+ <xref href="../topics/impala_views.xml#views"/> for details.
+ </p>
+
+ <p id="insert_hints" rev="1.2.2">
+ When inserting into partitioned tables, especially using the Parquet file format, you can include a hint in
+ the <codeph>INSERT</codeph> statement to fine-tune the overall performance of the operation and its
+ resource usage:
+ <ul>
+ <li>
+ These hints are available in Impala 1.2.2 and higher.
+ </li>
+
+ <li>
+ You would only use these hints if an <codeph>INSERT</codeph> into a partitioned Parquet table was
+ failing due to capacity limits, or if such an <codeph>INSERT</codeph> was succeeding but with
+ less-than-optimal performance.
+ </li>
+
+ <li>
+ To use these hints, put the hint keyword <codeph>[SHUFFLE]</codeph> or <codeph>[NOSHUFFLE]</codeph>
+ (including the square brackets) after the <codeph>PARTITION</codeph> clause, immediately before the
+ <codeph>SELECT</codeph> keyword.
+ </li>
+
+ <li>
+ <codeph>[SHUFFLE]</codeph> selects an execution plan that minimizes the number of files being written
+ simultaneously to HDFS, and the number of memory buffers holding data for individual partitions. Thus
+ it reduces overall resource usage for the <codeph>INSERT</codeph> operation, allowing some
+ <codeph>INSERT</codeph> operations to succeed that otherwise would fail. It does involve some data
+ transfer between the nodes so that the data files for a particular partition are all constructed on the
+ same node.
+ </li>
+
+ <li>
+ <codeph>[NOSHUFFLE]</codeph> selects an execution plan that might be faster overall, but might also
+ produce a larger number of small data files or exceed capacity limits, causing the
+ <codeph>INSERT</codeph> operation to fail. Use <codeph>[SHUFFLE]</codeph> in cases where an
+ <codeph>INSERT</codeph> statement fails or runs inefficiently due to all nodes attempting to construct
+ data for all partitions.
+ </li>
+
+ <li>
+ Impala automatically uses the <codeph>[SHUFFLE]</codeph> method if any partition key column in the
+ source table, mentioned in the <codeph>INSERT ... SELECT</codeph> query, does not have column
+ statistics. In this case, only the <codeph>[NOSHUFFLE]</codeph> hint would have any effect.
+ </li>
+
+ <li>
+ If column statistics are available for all partition key columns in the source table mentioned in the
+ <codeph>INSERT ... SELECT</codeph> query, Impala chooses whether to use the <codeph>[SHUFFLE]</codeph>
+ or <codeph>[NOSHUFFLE]</codeph> technique based on the estimated number of distinct values in those
+ columns and the number of nodes involved in the <codeph>INSERT</codeph> operation. In this case, you
+ might need the <codeph>[SHUFFLE]</codeph> or the <codeph>[NOSHUFFLE]</codeph> hint to override the
+ execution plan selected by Impala.
+ </li>
+ </ul>
+ </p>
+
+ <p id="insert_parquet_blocksize">
+ Any <codeph>INSERT</codeph> statement for a Parquet table requires enough free space in the HDFS filesystem
+ to write one block. Because Parquet data files use a block size of 1 GB by default, an
+ <codeph>INSERT</codeph> might fail (even for a very small amount of data) if your HDFS is running low on
+ space.
+ </p>
+
+ <note id="compute_stats_next" type="important">
+ After adding or replacing data in a table used in performance-critical queries, issue a <codeph>COMPUTE
+ STATS</codeph> statement to make sure all statistics are up-to-date. Consider updating statistics for a
+ table after any <codeph>INSERT</codeph>, <codeph>LOAD DATA</codeph>, or <codeph>CREATE TABLE AS
+ SELECT</codeph> statement in Impala, or after loading data through Hive and doing a <codeph>REFRESH
+ <varname>table_name</varname></codeph> in Impala. This technique is especially important for tables that
+ are very large, used in join queries, or both.
+ </note>
+
+ <p id="concat_blurb">
+ <b>Usage notes:</b> <codeph>concat()</codeph> and <codeph>concat_ws()</codeph> are appropriate for
+ concatenating the values of multiple columns within the same row, while <codeph>group_concat()</codeph>
+ joins together values from different rows.
+ </p>
+
+ <p id="null_sorting_change">
+ In Impala 1.2.1 and higher, all <codeph>NULL</codeph> values come at the end of the result set for
+ <codeph>ORDER BY ... ASC</codeph> queries, and at the beginning of the result set for <codeph>ORDER BY ...
+ DESC</codeph> queries. In effect, <codeph>NULL</codeph> is considered greater than all other values for
+ sorting purposes. The original Impala behavior always put <codeph>NULL</codeph> values at the end, even for
+ <codeph>ORDER BY ... DESC</codeph> queries. The new behavior in Impala 1.2.1 makes Impala more compatible
+ with other popular database systems. In Impala 1.2.1 and higher, you can override or specify the sorting
+ behavior for <codeph>NULL</codeph> by adding the clause <codeph>NULLS FIRST</codeph> or <codeph>NULLS
+ LAST</codeph> at the end of the <codeph>ORDER BY</codeph> clause.
+ </p>
+
+ <p id="return_same_type">
+ <b>Return type:</b> same as the initial argument value, except that integer values are promoted to
+ <codeph>BIGINT</codeph> and floating-point values are promoted to <codeph>DOUBLE</codeph>; use
+ <codeph>CAST()</codeph> when inserting into a smaller numeric column
+ </p>
+
+ <p id="ddl_blurb">
+ <b>Statement type:</b> DDL
+ </p>
+
+ <p id="dml_blurb">
+ <b>Statement type:</b> DML (but still affected by
+ <xref href="../topics/impala_sync_ddl.xml#sync_ddl">SYNC_DDL</xref> query option)
+ </p>
+
+ <p rev="1.2" id="sync_ddl_blurb">
+ If you connect to different Impala nodes within an <cmdname>impala-shell</cmdname> session for
+ load-balancing purposes, you can enable the <codeph>SYNC_DDL</codeph> query option to make each DDL
+ statement wait before returning, until the new or changed metadata has been received by all the Impala
+ nodes. See <xref href="../topics/impala_sync_ddl.xml#sync_ddl"/> for details.
+ </p>
+
+<!-- Boost no longer used in Impala 2.0 and later, so this conref is no longer referenced anywhere. -->
+
+ <p id="regexp_boost">
+ The Impala regular expression syntax conforms to the POSIX Extended Regular Expression syntax used by the
+ Boost library. For details, see
+ <xref href="http://www.boost.org/doc/libs/1_46_0/libs/regex/doc/html/boost_regex/syntax/basic_extended.html" scope="external" format="html">the
+ Boost documentation</xref>. It has most idioms familiar from regular expressions in Perl, Python, and so
+ on. It does not support <codeph>.*?</codeph> for non-greedy matches.
+ </p>
+
+ <p rev="2.0.0" id="regexp_re2">
+ In Impala 2.0 and later, the Impala regular expression syntax conforms to the POSIX Extended Regular
+ Expression syntax used by the Google RE2 library. For details, see
+ <xref href="https://code.google.com/p/re2/" scope="external" format="html">the RE2 documentation</xref>. It
+ has most idioms familiar from regular expressions in Perl, Python, and so on, including
+ <codeph>.*?</codeph> for non-greedy matches.
+ </p>
+
+ <p rev="2.0.0" id="regexp_re2_warning">
+ In Impala 2.0 and later, a change in the underlying regular expression library could cause changes in the
+ way regular expressions are interpreted by this function. Test any queries that use regular expressions and
+ adjust the expression patterns if necessary. See
+ <xref href="../topics/impala_incompatible_changes.xml#incompatible_changes_200"/> for details.
+ </p>
+
+ <p id="regexp_escapes">
+ Because the <cmdname>impala-shell</cmdname> interpreter uses the <codeph>\</codeph> character for escaping,
+ use <codeph>\\</codeph> to represent the regular expression escape character in any regular expressions
+ that you submit through <cmdname>impala-shell</cmdname>. You might prefer to use the equivalent character
+ class names, such as <codeph>[[:digit:]]</codeph> instead of <codeph>\d</codeph> which you would have to
+ escape
<TRUNCATED>
[20/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_abort_on_default_limit_exceeded.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_abort_on_default_limit_exceeded.xml b/docs/topics/impala_abort_on_default_limit_exceeded.xml
new file mode 100644
index 0000000..c58be63
--- /dev/null
+++ b/docs/topics/impala_abort_on_default_limit_exceeded.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="obwl" id="abort_on_default_limit_exceeded">
+
+ <title>ABORT_ON_DEFAULT_LIMIT_EXCEEDED Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p conref="../shared/impala_common.xml#common/obwl_query_options"/>
+
+ <p conref="../shared/impala_common.xml#common/type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/default_false_0"/>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_abort_on_error.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_abort_on_error.xml b/docs/topics/impala_abort_on_error.xml
new file mode 100644
index 0000000..1926333
--- /dev/null
+++ b/docs/topics/impala_abort_on_error.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="abort_on_error">
+
+ <title>ABORT_ON_ERROR Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Troubleshooting"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">ABORT_ON_ERROR query option</indexterm>
+ When this option is enabled, Impala cancels a query immediately when any of the nodes encounters an error,
+ rather than continuing and possibly returning incomplete results. This option is disabled by default, to help
+ gather maximum diagnostic information when an error occurs, for example, whether the same problem occurred on
+ all nodes or only a single node. Currently, the errors that Impala can skip over involve data corruption,
+ such as a column that contains a string value when expected to contain an integer value.
+ </p>
+
+ <p>
+ To control how much logging Impala does for non-fatal errors when <codeph>ABORT_ON_ERROR</codeph> is turned
+ off, use the <codeph>MAX_ERRORS</codeph> option.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/default_false_0"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref href="impala_max_errors.xml#max_errors"/>,
+ <xref href="impala_logging.xml#logging"/>
+ </p>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_aggregate_functions.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_aggregate_functions.xml b/docs/topics/impala_aggregate_functions.xml
new file mode 100644
index 0000000..5095266
--- /dev/null
+++ b/docs/topics/impala_aggregate_functions.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="aggregate_functions">
+
+ <title>Impala Aggregate Functions</title>
+ <titlealts><navtitle>Aggregate Functions</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="Aggregate Functions"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p conref="../shared/impala_common.xml#common/aggr1"/>
+
+<codeblock conref="../shared/impala_common.xml#common/aggr2"/>
+
+ <p conref="../shared/impala_common.xml#common/aggr3"/>
+
+ <p>
+ <indexterm audience="Cloudera">aggregate functions</indexterm>
+ </p>
+
+ <p outputclass="toc"/>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_aliases.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_aliases.xml b/docs/topics/impala_aliases.xml
new file mode 100644
index 0000000..66a16fe
--- /dev/null
+++ b/docs/topics/impala_aliases.xml
@@ -0,0 +1,71 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="aliases">
+
+ <title>Overview of Impala Aliases</title>
+ <titlealts><navtitle>Aliases</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ When you write the names of tables, columns, or column expressions in a query, you can assign an alias at the
+ same time. Then you can specify the alias rather than the original name when making other references to the
+ table or column in the same statement. You typically specify aliases that are shorter or easier to remember
+ (or both) than the original names. The aliases are printed in the query header, making them useful for
+ self-documenting output.
+ </p>
+
+ <p>
+ To set up an alias, add the <codeph>AS <varname>alias</varname></codeph> clause immediately after any table,
+ column, or expression name in the <codeph>SELECT</codeph> list or <codeph>FROM</codeph> list of a query. The
+ <codeph>AS</codeph> keyword is optional; you can also specify the alias immediately after the original name.
+ </p>
+
+ <p>
+ To use an alias name that matches one of the Impala reserved keywords (listed in
+ <xref href="impala_reserved_words.xml#reserved_words"/>), surround the identifier with either single or
+ double quotation marks, or <codeph>``</codeph> characters (backticks).
+ </p>
+
+<codeblock>select c1 as name, c2 as address, c3 as phone from table_with_terse_columns;
+select sum(ss_xyz_dollars_net) as total_sales from table_with_cryptic_columns;
+select one.name, two.address, three.phone from
+ census one, building_directory two, phonebook three
+ where one.id = two.id and two.id = three.id;</codeblock>
+
+ <p>
+ <ph conref="../shared/impala_common.xml#common/aliases_vs_identifiers"/>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p rev="2.3.0">
+ Queries involving the complex types (<codeph>ARRAY</codeph>,
+ <codeph>STRUCT</codeph>, and <codeph>MAP</codeph>) typically make
+ extensive use of table aliases. These queries involve join clauses
+ where the complex type column is treated as a joined table.
+ To construct two-part or three-part qualified names for the
+ complex column elements in the <codeph>FROM</codeph> list,
+ sometimes it is syntactically required to construct a table
+ alias for the complex column where it is referenced in the join clause.
+ See <xref href="impala_complex_types.xml#complex_types"/> for details and examples.
+ </p>
+
+ <p>
+ <b>Alternatives:</b>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/views_vs_identifiers"/>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_allow_unsupported_formats.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_allow_unsupported_formats.xml b/docs/topics/impala_allow_unsupported_formats.xml
new file mode 100644
index 0000000..824daa4
--- /dev/null
+++ b/docs/topics/impala_allow_unsupported_formats.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="allow_unsupported_formats">
+
+ <title>ALLOW_UNSUPPORTED_FORMATS Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+<!--
+The original brief explanation with not enough detail comes from the comments at:
+ http://github.sf.cloudera.com/CDH/Impala/raw/master/common/thrift/ImpalaService.thrift
+Removing that wording from here after discussions with dev team. Just recording the URL for posterity.
+-->
+
+ <p>
+ An obsolete query option from early work on support for file formats. Do not use. Might be removed in the
+ future.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/default_false_0"/>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_alter_table.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_alter_table.xml b/docs/topics/impala_alter_table.xml
new file mode 100644
index 0000000..800261a
--- /dev/null
+++ b/docs/topics/impala_alter_table.xml
@@ -0,0 +1,411 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="alter_table">
+
+ <title>ALTER TABLE Statement</title>
+ <titlealts><navtitle>ALTER TABLE</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="HDFS Caching"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">ALTER TABLE statement</indexterm>
+ The <codeph>ALTER TABLE</codeph> statement changes the structure or properties of an existing table. In
+ Impala, this is a logical operation that updates the table metadata in the metastore database that Impala
+ shares with Hive; <codeph>ALTER TABLE</codeph> does not actually rewrite, move, and so on the actual data
+ files. Thus, you might need to perform corresponding physical filesystem operations, such as moving data
+ files to a different HDFS directory, rewriting the data files to include extra fields, or converting them to
+ a different file format.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>ALTER TABLE [<varname>old_db_name</varname>.]<varname>old_table_name</varname> RENAME TO [<varname>new_db_name</varname>.]<varname>new_table_name</varname>
+
+ALTER TABLE <varname>name</varname> ADD COLUMNS (<varname>col_spec</varname>[, <varname>col_spec</varname> ...])
+ALTER TABLE <varname>name</varname> DROP [COLUMN] <varname>column_name</varname>
+ALTER TABLE <varname>name</varname> CHANGE <varname>column_name</varname> <varname>new_name</varname> <varname>new_type</varname>
+ALTER TABLE <varname>name</varname> REPLACE COLUMNS (<varname>col_spec</varname>[, <varname>col_spec</varname> ...])
+
+ALTER TABLE <varname>name</varname> { ADD | DROP } PARTITION (<varname>partition_spec</varname>) <ph rev="2.3.0">[PURGE]</ph>
+
+ALTER TABLE <varname>name</varname> [PARTITION (<varname>partition_spec</varname>)]
+ SET { FILEFORMAT <varname>file_format</varname>
+ | LOCATION '<varname>hdfs_path_of_directory</varname>'
+ | TBLPROPERTIES (<varname>table_properties</varname>)
+ | SERDEPROPERTIES (<varname>serde_properties</varname>) }
+
+<ph rev="1.4.0">ALTER TABLE <varname>name</varname> [PARTITION (<varname>partition_spec</varname>)] SET { CACHED IN '<varname>pool_name</varname>' <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED }</ph>
+
+<varname>new_name</varname> ::= [<varname>new_database</varname>.]<varname>new_table_name</varname>
+
+<varname>col_spec</varname> ::= <varname>col_name</varname> <varname>type_name</varname>
+
+<varname>partition_spec</varname> ::= <varname>partition_col</varname>=<varname>constant_value</varname>
+
+<varname>table_properties</varname> ::= '<varname>name</varname>'='<varname>value</varname>'[, '<varname>name</varname>'='<varname>value</varname>' ...]
+
+<varname>serde_properties</varname> ::= '<varname>name</varname>'='<varname>value</varname>'[, '<varname>name</varname>'='<varname>value</varname>' ...]
+
+<varname>file_format</varname> ::= { PARQUET | TEXTFILE | RCFILE | SEQUENCEFILE | AVRO }
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p rev="2.3.0">
+ In CDH 5.5 / Impala 2.3 and higher, the <codeph>ALTER TABLE</codeph> statement can
+ change the metadata for tables containing complex types (<codeph>ARRAY</codeph>,
+ <codeph>STRUCT</codeph>, and <codeph>MAP</codeph>).
+ For example, you can use an <codeph>ADD COLUMNS</codeph>, <codeph>DROP COLUMN</codeph>, or <codeph>CHANGE</codeph>
+ clause to modify the table layout for complex type columns.
+ Although Impala queries only work for complex type columns in Parquet tables, the complex type support in the
+ <codeph>ALTER TABLE</codeph> statement applies to all file formats.
+ For example, you can use Impala to update metadata for a staging table in a non-Parquet file format where the
+ data is populated by Hive. Or you can use <codeph>ALTER TABLE SET FILEFORMAT</codeph> to change the format
+ of an existing table to Parquet so that Impala can query it. (Remember that changing the file format for a table does
+ not convert the data files within the table; you must prepare any Parquet data files containing complex types
+ outside Impala, and bring them into the table using <codeph>LOAD DATA</codeph> or updating the table's
+ <codeph>LOCATION</codeph> property.)
+ See <xref href="impala_complex_types.xml#complex_types"/> for details about using complex types.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Whenever you specify partitions in an <codeph>ALTER TABLE</codeph> statement, through the <codeph>PARTITION
+ (<varname>partition_spec</varname>)</codeph> clause, you must include all the partitioning columns in the
+ specification.
+ </p>
+
+ <p>
+ Most of the <codeph>ALTER TABLE</codeph> operations work the same for internal tables (managed by Impala) as
+ for external tables (with data files located in arbitrary locations). The exception is renaming a table; for
+ an external table, the underlying data directory is not renamed or moved.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/s3_blurb"/>
+ <p rev="2.2.0">
+ You can specify an <codeph>s3a://</codeph> prefix in the <codeph>LOCATION</codeph> attribute of a table or partition
+ to make Impala query data from the Amazon S3 filesystem.
+ See <xref href="impala_s3.xml#s3"/> for details.
+ </p>
+
+ <p rev="1.4.0">
+ <b>HDFS caching (CACHED IN clause):</b>
+ </p>
+
+ <p rev="1.4.0">
+ If you specify the <codeph>CACHED IN</codeph> clause, any existing or future data files in the table
+ directory or the partition subdirectories are designated to be loaded into memory with the HDFS caching
+ mechanism. See <xref href="impala_perf_hdfs_caching.xml#hdfs_caching"/> for details about using the HDFS
+ caching feature.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/impala_cache_replication_factor"/>
+
+ <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+
+ <p>
+ The following sections show examples of the use cases for various <codeph>ALTER TABLE</codeph> clauses.
+ </p>
+
+ <p>
+ <b>To rename a table (RENAME TO clause):</b>
+ </p>
+
+<!-- Beefing up the syntax in its original location up to, don't need to repeat it here.
+<codeblock>ALTER TABLE <varname>old_name</varname> RENAME TO <varname>new_name</varname>;</codeblock>
+-->
+
+ <p>
+ The <codeph>RENAME TO</codeph> clause lets you change the name of an existing table, and optionally which
+ database it is located in.
+ </p>
+
+ <p>
+ For internal tables, this operation physically renames the directory within HDFS that contains the data files;
+ the original directory name no longer exists. By qualifying the table names with database names, you can use
+ this technique to move an internal table (and its associated data directory) from one database to another.
+ For example:
+ </p>
+
+<codeblock>create database d1;
+create database d2;
+create database d3;
+use d1;
+create table mobile (x int);
+use d2;
+-- Move table from another database to the current one.
+alter table d1.mobile rename to mobile;
+use d1;
+-- Move table from one database to another.
+alter table d2.mobile rename to d3.mobile;</codeblock>
+
+ <p>
+ For external tables, the underlying HDFS data directory is not renamed or moved; only the table metadata
+ in the metastore database is updated to reflect the new name.
+ </p>
+
+ <p>
+ <b>To change the physical location where Impala looks for data files associated with a table or
+ partition:</b>
+ </p>
+
+<codeblock>ALTER TABLE <varname>table_name</varname> [PARTITION (<varname>partition_spec</varname>)] SET LOCATION '<varname>hdfs_path_of_directory</varname>';</codeblock>
+
+ <p>
+ The path you specify is the full HDFS path where the data files reside, or will be created. Impala does not
+ create any additional subdirectory named after the table. Impala does not move any data files to this new
+ location or change any data files that might already exist in that directory.
+ </p>
+
+ <p>
+ To set the location for a single partition, include the <codeph>PARTITION</codeph> clause. Specify all the
+ same partitioning columns for the table, with a constant value for each, to precisely identify the single
+ partition affected by the statement:
+ </p>
+
+<codeblock>create table p1 (s string) partitioned by (month int, day int);
+-- Each ADD PARTITION clause creates a subdirectory in HDFS.
+alter table p1 add partition (month=1, day=1);
+alter table p1 add partition (month=1, day=2);
+alter table p1 add partition (month=2, day=1);
+alter table p1 add partition (month=2, day=2);
+-- Redirect queries, INSERT, and LOAD DATA for one partition
+-- to a specific different directory.
+alter table p1 partition (month=1, day=1) set location '/usr/external_data/new_years_day';
+</codeblock>
+
+ <note conref="../shared/impala_common.xml#common/add_partition_set_location"/>
+
+ <p rev="1.2">
+ <b>To change the key-value pairs of the TBLPROPERTIES and SERDEPROPERTIES fields:</b>
+ </p>
+
+<codeblock>ALTER TABLE <varname>table_name</varname> SET TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>'[, ...]);
+ALTER TABLE <varname>table_name</varname> SET SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>'[, ...]);</codeblock>
+
+ <p>
+ The <codeph>TBLPROPERTIES</codeph> clause is primarily a way to associate arbitrary user-specified data items
+ with a particular table.
+ </p>
+
+ <p>
+ The <codeph>SERDEPROPERTIES</codeph> clause sets up metadata defining how tables are read or written, needed
+ in some cases by Hive but not used extensively by Impala. You would use this clause primarily to change the
+ delimiter in an existing text table or partition, by setting the <codeph>'serialization.format'</codeph> and
+ <codeph>'field.delim'</codeph> property values to the new delimiter character:
+ </p>
+
+<codeblock>-- This table begins life as pipe-separated text format.
+create table change_to_csv (s1 string, s2 string) row format delimited fields terminated by '|';
+-- Then we change it to a CSV table.
+alter table change_to_csv set SERDEPROPERTIES ('serialization.format'=',', 'field.delim'=',');
+insert overwrite change_to_csv values ('stop','go'), ('yes','no');
+!hdfs dfs -cat 'hdfs://<varname>hostname</varname>:8020/<varname>data_directory</varname>/<varname>dbname</varname>.db/change_to_csv/<varname>data_file</varname>';
+stop,go
+yes,no</codeblock>
+
+ <p>
+ Use the <codeph>DESCRIBE FORMATTED</codeph> statement to see the current values of these properties for an
+ existing table. See <xref href="impala_create_table.xml#create_table"/> for more details about these clauses.
+ See <xref href="impala_perf_stats.xml#perf_stats_manual"/> for an example of using table properties to
+ fine-tune the performance-related table statistics.
+ </p>
+
+ <p>
+ <b>To reorganize columns for a table:</b>
+ </p>
+
+<codeblock>ALTER TABLE <varname>table_name</varname> ADD COLUMNS (<varname>column_defs</varname>);
+ALTER TABLE <varname>table_name</varname> REPLACE COLUMNS (<varname>column_defs</varname>);
+ALTER TABLE <varname>table_name</varname> CHANGE <varname>column_name</varname> <varname>new_name</varname> <varname>new_type</varname>;
+ALTER TABLE <varname>table_name</varname> DROP <varname>column_name</varname>;</codeblock>
+
+ <p>
+ The <varname>column_spec</varname> is the same as in the <codeph>CREATE TABLE</codeph> statement: the column
+ name, then its data type, then an optional comment. You can add multiple columns at a time. The parentheses
+ are required whether you add a single column or multiple columns. When you replace columns, all the original
+ column definitions are discarded. You might use this technique if you receive a new set of data files with
+ different data types or columns in a different order. (The data files are retained, so if the new columns are
+ incompatible with the old ones, use <codeph>INSERT OVERWRITE</codeph> or <codeph>LOAD DATA OVERWRITE</codeph>
+ to replace all the data before issuing any further queries.)
+ </p>
+
+ <p>
+ You might use the <codeph>CHANGE</codeph> clause to rename a single column, or to treat an existing column as
+ a different type than before, such as to switch between treating a column as <codeph>STRING</codeph> and
+ <codeph>TIMESTAMP</codeph>, or between <codeph>INT</codeph> and <codeph>BIGINT</codeph>. You can only drop a
+ single column at a time; to drop multiple columns, issue multiple <codeph>ALTER TABLE</codeph> statements, or
+ define the new set of columns with a single <codeph>ALTER TABLE ... REPLACE COLUMNS</codeph> statement.
+ </p>
+
+ <p>
+ <b>To change the file format that Impala expects data to be in, for a table or partition:</b>
+ </p>
+
+ <p>
+ Use an <codeph>ALTER TABLE ... SET FILEFORMAT</codeph> clause. You can include an optional <codeph>PARTITION
+ (<varname>col1</varname>=<varname>val1</varname>, <varname>col2</varname>=<varname>val2</varname>,
+ ...)</codeph> clause so that the file format is changed for a specific partition rather than the entire table.
+ </p>
+
+ <p>
+ Because this operation only changes the table metadata, you must do any conversion of existing data using
+ regular Hadoop techniques outside of Impala. Any new data created by the Impala <codeph>INSERT</codeph>
+ statement will be in the new format. You cannot specify the delimiter for Text files; the data files must be
+ comma-delimited.
+<!-- Although Impala can read Avro tables
+ created through Hive, you cannot specify the Avro file format in an Impala
+ <codeph>ALTER TABLE</codeph> statement. -->
+ </p>
+
+ <p>
+ To set the file format for a single partition, include the <codeph>PARTITION</codeph> clause. Specify all the
+ same partitioning columns for the table, with a constant value for each, to precisely identify the single
+ partition affected by the statement:
+ </p>
+
+<codeblock>create table p1 (s string) partitioned by (month int, day int);
+-- Each ADD PARTITION clause creates a subdirectory in HDFS.
+alter table p1 add partition (month=1, day=1);
+alter table p1 add partition (month=1, day=2);
+alter table p1 add partition (month=2, day=1);
+alter table p1 add partition (month=2, day=2);
+-- Queries and INSERT statements will read and write files
+-- in this format for this specific partition.
+alter table p1 partition (month=2, day=2) set fileformat parquet;
+</codeblock>
+
+ <p>
+ <b>To add or drop partitions for a table</b>, the table must already be partitioned (that is, created with a
+ <codeph>PARTITIONED BY</codeph> clause). The partition is a physical directory in HDFS, with a name that
+ encodes a particular column value (the <b>partition key</b>). The Impala <codeph>INSERT</codeph> statement
+ already creates the partition if necessary, so the <codeph>ALTER TABLE ... ADD PARTITION</codeph> is
+ primarily useful for importing data by moving or copying existing data files into the HDFS directory
+ corresponding to a partition. (You can use the <codeph>LOAD DATA</codeph> statement to move files into the
+ partition directory, or <codeph>ALTER TABLE ... PARTITION (...) SET LOCATION</codeph> to point a partition at
+ a directory that already contains data files.)
+ </p>
+
+ <p>
+ The <codeph>DROP PARTITION</codeph> clause is used to remove the HDFS directory and associated data files for
+ a particular set of partition key values; for example, if you always analyze the last 3 months worth of data,
+ at the beginning of each month you might drop the oldest partition that is no longer needed. Removing
+ partitions reduces the amount of metadata associated with the table and the complexity of calculating the
+ optimal query plan, which can simplify and speed up queries on partitioned tables, particularly join queries.
+ Here is an example showing the <codeph>ADD PARTITION</codeph> and <codeph>DROP PARTITION</codeph> clauses.
+ </p>
+
+ <p rev="2.3.0">
+ The optional <codeph>PURGE</codeph> keyword, available in CDH 5.5 / Impala 2.3 and higher,
+ is used with the <codeph>DROP PARTITION</codeph> clause to remove associated HDFS data files
+ immediately rather than going through the HDFS trashcan mechanism.
+ Use this keyword when dropping a partition if it is
+ crucial to remove the data as quickly as possible to free up space, or if there is a problem with
+ the trashcan, such as the trashcan not being configured or being in a different HDFS encryption zone
+ than the data files.
+ </p>
+
+ <draft-comment translate="no">
+ Make example more general by partitioning by year/month/day.
+ Then could show inserting into fixed year, variable month and day;
+ dropping particular year/month/day partition.
+ </draft-comment>
+
+<codeblock>-- Create an empty table and define the partitioning scheme.
+create table part_t (x int) partitioned by (month int);
+-- Create an empty partition into which you could copy data files from some other source.
+alter table part_t add partition (month=1);
+-- After changing the underlying data, issue a REFRESH statement to make the data visible in Impala.
+refresh part_t;
+-- Later, do the same for the next month.
+alter table part_t add partition (month=2);
+
+-- Now you no longer need the older data.
+alter table part_t drop partition (month=1);
+-- If the table was partitioned by month and year, you would issue a statement like:
+-- alter table part_t drop partition (year=2003,month=1);
+-- which would require 12 ALTER TABLE statements to remove a year's worth of data.
+
+-- If the data files for subsequent months were in a different file format,
+-- you could set a different file format for the new partition as you create it.
+alter table part_t add partition (month=3) set fileformat parquet;
+</codeblock>
+
+ <p>
+ The value specified for a partition key can be an arbitrary constant expression, without any references to
+ columns. For example:
+ </p>
+
+<codeblock>alter table time_data add partition (month=concat('Decem','ber'));
+alter table sales_data add partition (zipcode = cast(9021 * 10 as string));</codeblock>
+
+ <note>
+ <p>
+ An alternative way to reorganize a table and its associated data files is to use <codeph>CREATE
+ TABLE</codeph> to create a variation of the original table, then use <codeph>INSERT</codeph> to copy the
+ transformed or reordered data to the new table. The advantage of <codeph>ALTER TABLE</codeph> is that it
+ avoids making a duplicate copy of the data files, allowing you to reorganize huge volumes of data in a
+ space-efficient way using familiar Hadoop techniques.
+ </p>
+ </note>
+
+ <p>
+ <b>To switch a table between internal and external:</b>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/switch_internal_external_table"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ Most <codeph>ALTER TABLE</codeph> clauses do not actually
+ read or write any HDFS files, and so do not depend on
+ specific HDFS permissions. For example, the <codeph>SET FILEFORMAT</codeph>
+ clause does not actually check the file format of existing data files or
+ convert them to the new format, and the <codeph>SET LOCATION</codeph> clause
+ does not require any special permissions on the new location.
+ (Any permission-related failures would come later, when you
+ actually query or insert into the table.)
+ </p>
+<!-- Haven't rigorously tested all the assertions in the following paragraph. -->
+<!-- Most testing so far has been around RENAME TO clause. -->
+ <p>
+ In general, <codeph>ALTER TABLE</codeph> clauses that do touch
+ HDFS files and directories require the same HDFS permissions
+ as corresponding <codeph>CREATE</codeph>, <codeph>INSERT</codeph>,
+ or <codeph>SELECT</codeph> statements.
+ The permissions allow
+ the user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, to read or write
+ files or directories, or (in the case of the execute bit) descend into a directory.
+ The <codeph>RENAME TO</codeph> clause requires read, write, and execute permission in the
+ source and destination database directories and in the table data directory,
+ and read and write permission for the data files within the table.
+ The <codeph>ADD PARTITION</codeph> and <codeph>DROP PARTITION</codeph> clauses
+ require write and execute permissions for the associated partition directory.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_tables.xml#tables"/>,
+ <xref href="impala_create_table.xml#create_table"/>, <xref href="impala_drop_table.xml#drop_table"/>,
+ <xref href="impala_partitioning.xml#partitioning"/>, <xref href="impala_tables.xml#internal_tables"/>,
+ <xref href="impala_tables.xml#external_tables"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_alter_view.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_alter_view.xml b/docs/topics/impala_alter_view.xml
new file mode 100644
index 0000000..0d83032
--- /dev/null
+++ b/docs/topics/impala_alter_view.xml
@@ -0,0 +1,73 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.1" id="alter_view">
+
+ <title>ALTER VIEW Statement</title>
+ <titlealts><navtitle>ALTER VIEW</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">ALTER VIEW statement</indexterm>
+ Changes the query associated with a view, or the associated database and/or name of the view.
+ </p>
+
+ <p>
+ Because a view is purely a logical construct (an alias for a query) with no physical data behind it,
+ <codeph>ALTER VIEW</codeph> only involves changes to metadata in the metastore database, not any data files
+ in HDFS.
+ </p>
+
+<!-- View _permissions_ don't rely on underlying table. -->
+
+<!-- Could use views to grant access only to certain columns. -->
+
+<!-- Treated like a table for authorization. -->
+
+<!-- ALTER VIEW that queries another view - possibly a runtime error. -->
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>ALTER VIEW [<varname>database_name</varname>.]<varname>view_name</varname> AS <varname>select_statement</varname>
+ALTER VIEW [<varname>database_name</varname>.]<varname>view_name</varname> RENAME TO [<varname>database_name</varname>.]<varname>view_name</varname></codeblock>
+
+ <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+ <p conref="../shared/impala_common.xml#common/redaction_yes"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>create table t1 (x int, y int, s string);
+create table t2 like t1;
+create view v1 as select * from t1;
+alter view v1 as select * from t2;
+alter view v1 as select x, upper(s) s from t2;</codeblock>
+
+<!-- Repeat the same blurb + example to see the definition of a view, as in CREATE VIEW. -->
+
+ <p conref="../shared/impala_common.xml#common/describe_formatted_view"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_views.xml#views"/>, <xref href="impala_create_view.xml#create_view"/>,
+ <xref href="impala_drop_view.xml#drop_view"/>
+ </p>
+ </conbody>
+</concept>
[07/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_operators.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_operators.xml b/docs/topics/impala_operators.xml
new file mode 100644
index 0000000..da3dab3
--- /dev/null
+++ b/docs/topics/impala_operators.xml
@@ -0,0 +1,1262 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="operators">
+
+ <title>SQL Operators</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">operators</indexterm>
+ SQL operators are a class of comparison functions that are widely used within the <codeph>WHERE</codeph>
+ clauses of <codeph>SELECT</codeph> statements.
+ </p>
+
+ <p outputclass="toc inpage"/>
+ </conbody>
+
+ <concept rev="1.4.0" id="arithmetic_operators">
+
+ <title>Arithmetic Operators</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">arithmetic operators</indexterm>
+ The arithmetic operators use expressions with a left-hand argument, the operator, and then (in most cases)
+ a right-hand argument.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock><varname>left_hand_arg</varname> <varname>binary_operator</varname> <varname>right_hand_arg</varname>
+<varname>unary_operator</varname> <varname>single_arg</varname>
+</codeblock>
+
+ <ul>
+ <li>
+ <codeph>+</codeph> and <codeph>-</codeph>: Can be used either as unary or binary operators.
+ <ul>
+ <li>
+ <p>
+ With unary notation, such as <codeph>+5</codeph>, <codeph>-2.5</codeph>, or
+ <codeph>-<varname>col_name</varname></codeph>, they multiply their single numeric argument by
+ <codeph>+1</codeph> or <codeph>-1</codeph>. Therefore, unary <codeph>+</codeph> returns its
+ argument unchanged, while unary <codeph>-</codeph> flips the sign of its argument. Although you can
+ double up these operators in expressions such as <codeph>++5</codeph> (always positive) or
+ <codeph>-+2</codeph> or <codeph>+-2</codeph> (both always negative), you cannot double the unary
+ minus operator because <codeph>--</codeph> is interpreted as the start of a comment. (You can use a
+ double unary minus operator if you separate the <codeph>-</codeph> characters, for example with a
+ space or parentheses.)
+ </p>
+ </li>
+
+ <li>
+ <p>
+ With binary notation, such as <codeph>2+2</codeph>, <codeph>5-2.5</codeph>, or
+ <codeph><varname>col1</varname> + <varname>col2</varname></codeph>, they add or subtract
+ respectively the right-hand argument to (or from) the left-hand argument. Both arguments must be of
+ numeric types.
+ </p>
+ </li>
+ </ul>
+ </li>
+
+ <li>
+ <p>
+ <codeph>*</codeph> and <codeph>/</codeph>: Multiplication and division respectively. Both arguments
+ must be of numeric types.
+ </p>
+ <p>
+ When multiplying, the shorter argument is promoted if necessary (such as <codeph>SMALLINT</codeph> to
+ <codeph>INT</codeph> or <codeph>BIGINT</codeph>, or <codeph>FLOAT</codeph> to <codeph>DOUBLE</codeph>),
+ and then the result is promoted again to the next larger type. Thus, multiplying a
+ <codeph>TINYINT</codeph> and an <codeph>INT</codeph> produces a <codeph>BIGINT</codeph> result.
+ Multiplying a <codeph>FLOAT</codeph> and a <codeph>FLOAT</codeph> produces a <codeph>DOUBLE</codeph>
+ result. Multiplying a <codeph>FLOAT</codeph> and a <codeph>DOUBLE</codeph> or a <codeph>DOUBLE</codeph>
+ and a <codeph>DOUBLE</codeph> produces a <codeph>DECIMAL(38,17)</codeph>, because
+ <codeph>DECIMAL</codeph> values can represent much larger and more precise values than
+ <codeph>DOUBLE</codeph>.
+ </p>
+ <p>
+ When dividing, Impala always treats the arguments and result as <codeph>DOUBLE</codeph> values to avoid
+ losing precision. If you need to insert the results of a division operation into a
+ <codeph>FLOAT</codeph> column, use the <codeph>CAST()</codeph> function to convert the result to the
+ correct type.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ <codeph>%</codeph>: Modulo operator. Returns the remainder of the left-hand argument divided by the
+ right-hand argument. Both arguments must be of one of the integer types.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ <codeph>&</codeph>, <codeph>|</codeph>, <codeph>~</codeph>, and <codeph>^</codeph>: Bitwise operators that return the
+ logical AND, logical OR, <codeph>NOT</codeph>, or logical XOR (exclusive OR) of their argument values. Both arguments must be
+ of one of the integer types. If the arguments are of different type, the argument with the smaller type
+ is implicitly extended to match the argument with the larger type.
+ </p>
+ </li>
+ </ul>
+
+ <p>
+ You can chain a sequence of arithmetic expressions, optionally grouping them with parentheses.
+ </p>
+
+ <p>
+ The arithmetic operators generally do not have equivalent calling conventions using functional notation.
+ For example, prior to Impala 2.2.0 / CDH 5.4.0, there is no <codeph>MOD()</codeph> function equivalent to the <codeph>%</codeph> modulo
+ operator. Conversely, there are some arithmetic functions that do not have a corresponding operator. For
+ example, for exponentiation you use the <codeph>POW()</codeph> function, but there is no
+ <codeph>**</codeph> exponentiation operator. See <xref href="impala_math_functions.xml#math_functions"/>
+ for the arithmetic functions you can use.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_explanation"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_example"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_caveat_no_operator"/>
+
+ <p rev="2.3.0">
+ The following example shows how to do an arithmetic operation using a numeric field of a <codeph>STRUCT</codeph> type
+ that is an item within an <codeph>ARRAY</codeph> column. Once the scalar numeric value <codeph>R_NATIONKEY</codeph>
+ is extracted, it can be used in an arithmetic expression, such as multiplying by 10:
+ </p>
+
+<codeblock rev="2.3.0">
+-- The SMALLINT is a field within an array of structs.
+describe region;
++-------------+-------------------------+---------+
+| name | type | comment |
++-------------+-------------------------+---------+
+| r_regionkey | smallint | |
+| r_name | string | |
+| r_comment | string | |
+| r_nations | array<struct< | |
+| | n_nationkey:smallint, | |
+| | n_name:string, | |
+| | n_comment:string | |
+| | >> | |
++-------------+-------------------------+---------+
+
+-- When we refer to the scalar value using dot notation,
+-- we can use arithmetic and comparison operators on it
+-- like any other number.
+select r_name, nation.item.n_name, nation.item.n_nationkey * 10
+ from region, region.r_nations as nation
+where nation.item.n_nationkey < 5;
++-------------+-------------+------------------------------+
+| r_name | item.n_name | nation.item.n_nationkey * 10 |
++-------------+-------------+------------------------------+
+| AMERICA | CANADA | 30 |
+| AMERICA | BRAZIL | 20 |
+| AMERICA | ARGENTINA | 10 |
+| MIDDLE EAST | EGYPT | 40 |
+| AFRICA | ALGERIA | 0 |
++-------------+-------------+------------------------------+
+</codeblock>
+
+ </conbody>
+ </concept>
+
+ <concept id="between">
+
+ <title>BETWEEN Operator</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">BETWEEN operator</indexterm>
+ In a <codeph>WHERE</codeph> clause, compares an expression to both a lower and upper bound. The comparison
+ is successful if the expression is greater than or equal to the lower bound, and less than or equal to the
+ upper bound. If the bound values are switched, so the lower bound is greater than the upper bound, the
+ comparison does not match any values.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock><varname>expression</varname> BETWEEN <varname>lower_bound</varname> AND <varname>upper_bound</varname></codeblock>
+
+ <p>
+ <b>Data types:</b> Typically used with numeric data types. Works with any data type, although not very
+ practical for <codeph>BOOLEAN</codeph> values. (<codeph>BETWEEN false AND true</codeph> will match all
+ <codeph>BOOLEAN</codeph> values.) Use <codeph>CAST()</codeph> if necessary to ensure the lower and upper
+ bound values are compatible types. Call string or date/time functions if necessary to extract or transform
+ the relevant portion to compare, especially if the value can be transformed into a number.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Be careful when using short string operands. A longer string that starts with the upper bound value will
+ not be included, because it is considered greater than the upper bound. For example, <codeph>BETWEEN 'A'
+ and 'M'</codeph> would not match the string value <codeph>'Midway'</codeph>. Use functions such as
+ <codeph>upper()</codeph>, <codeph>lower()</codeph>, <codeph>substr()</codeph>, <codeph>trim()</codeph>, and
+ so on if necessary to ensure the comparison works as expected.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_caveat_no_operator"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>-- Retrieve data for January through June, inclusive.
+select c1 from t1 where month <b>between 1 and 6</b>;
+
+-- Retrieve data for names beginning with 'A' through 'M' inclusive.
+-- Only test the first letter to ensure all the values starting with 'M' are matched.
+-- Do a case-insensitive comparison to match names with various capitalization conventions.
+select last_name from customers where upper(substr(last_name,1,1)) <b>between 'A' and 'M'</b>;
+
+-- Retrieve data for only the first week of each month.
+select count(distinct visitor_id) from web_traffic where dayofmonth(when_viewed) <b>between 1 and 7</b>;</codeblock>
+
+ <p rev="2.3.0">
+ The following example shows how to do a <codeph>BETWEEN</codeph> comparison using a numeric field of a <codeph>STRUCT</codeph> type
+ that is an item within an <codeph>ARRAY</codeph> column. Once the scalar numeric value <codeph>R_NATIONKEY</codeph>
+ is extracted, it can be used in a comparison operator:
+ </p>
+
+<codeblock rev="2.3.0">
+-- The SMALLINT is a field within an array of structs.
+describe region;
++-------------+-------------------------+---------+
+| name | type | comment |
++-------------+-------------------------+---------+
+| r_regionkey | smallint | |
+| r_name | string | |
+| r_comment | string | |
+| r_nations | array<struct< | |
+| | n_nationkey:smallint, | |
+| | n_name:string, | |
+| | n_comment:string | |
+| | >> | |
++-------------+-------------------------+---------+
+
+-- When we refer to the scalar value using dot notation,
+-- we can use arithmetic and comparison operators on it
+-- like any other number.
+select r_name, nation.item.n_name, nation.item.n_nationkey
+from region, region.r_nations as nation
+where nation.item.n_nationkey between 3 and 5;
++-------------+-------------+------------------+
+| r_name | item.n_name | item.n_nationkey |
++-------------+-------------+------------------+
+| AMERICA | CANADA | 3 |
+| MIDDLE EAST | EGYPT | 4 |
+| AFRICA | ETHIOPIA | 5 |
++-------------+-------------+------------------+
+</codeblock>
+
+ </conbody>
+ </concept>
+
+ <concept id="comparison_operators">
+
+ <title>Comparison Operators</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">comparison operators</indexterm>
+ Impala supports the familiar comparison operators for checking equality and sort order for the column data
+ types:
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock><varname>left_hand_expression</varname> <varname>comparison_operator</varname> <varname>right_hand_expression</varname></codeblock>
+
+ <ul>
+ <li>
+ <codeph>=</codeph>, <codeph>!=</codeph>, <codeph><></codeph>: apply to all types.
+ </li>
+
+ <li>
+ <codeph><</codeph>, <codeph><=</codeph>, <codeph>></codeph>, <codeph>>=</codeph>: apply to
+ all types; for <codeph>BOOLEAN</codeph>, <codeph>TRUE</codeph> is considered greater than
+ <codeph>FALSE</codeph>.
+ </li>
+ </ul>
+
+ <p>
+ <b>Alternatives:</b>
+ </p>
+
+ <p>
+ The <codeph>IN</codeph> and <codeph>BETWEEN</codeph> operators provide shorthand notation for expressing
+ combinations of equality, less than, and greater than comparisons with a single operator.
+ </p>
+
+ <p>
+ Because comparing any value to <codeph>NULL</codeph> produces <codeph>NULL</codeph> rather than
+ <codeph>TRUE</codeph> or <codeph>FALSE</codeph>, use the <codeph>IS NULL</codeph> and <codeph>IS NOT
+ NULL</codeph> operators to check if a value is <codeph>NULL</codeph> or not.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_caveat_no_operator"/>
+
+ <p rev="2.3.0">
+ The following example shows how to do an arithmetic operation using a numeric field of a <codeph>STRUCT</codeph> type
+ that is an item within an <codeph>ARRAY</codeph> column. Once the scalar numeric value <codeph>R_NATIONKEY</codeph>
+ is extracted, it can be used with a comparison operator such as <codeph><</codeph>:
+ </p>
+
+<codeblock rev="2.3.0">
+-- The SMALLINT is a field within an array of structs.
+describe region;
++-------------+-------------------------+---------+
+| name | type | comment |
++-------------+-------------------------+---------+
+| r_regionkey | smallint | |
+| r_name | string | |
+| r_comment | string | |
+| r_nations | array<struct< | |
+| | n_nationkey:smallint, | |
+| | n_name:string, | |
+| | n_comment:string | |
+| | >> | |
++-------------+-------------------------+---------+
+
+-- When we refer to the scalar value using dot notation,
+-- we can use arithmetic and comparison operators on it
+-- like any other number.
+select r_name, nation.item.n_name, nation.item.n_nationkey
+from region, region.r_nations as nation
+where nation.item.n_nationkey < 5;
++-------------+-------------+------------------+
+| r_name | item.n_name | item.n_nationkey |
++-------------+-------------+------------------+
+| AMERICA | CANADA | 3 |
+| AMERICA | BRAZIL | 2 |
+| AMERICA | ARGENTINA | 1 |
+| MIDDLE EAST | EGYPT | 4 |
+| AFRICA | ALGERIA | 0 |
++-------------+-------------+------------------+
+</codeblock>
+
+ </conbody>
+ </concept>
+
+ <concept audience="Cloudera" rev="2.1.0" id="except">
+
+ <title>EXCEPT Operator</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">EXCEPT operator</indexterm>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept rev="2.0.0" id="exists">
+
+ <title>EXISTS Operator</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">EXISTS operator</indexterm>
+ <indexterm audience="Cloudera">NOT EXISTS operator</indexterm>
+ The <codeph>EXISTS</codeph> operator tests whether a subquery returns any results.
+ You typically use it to find values from one table that have corresponding values in another table.
+ </p>
+
+ <p>
+ The converse, <codeph>NOT EXISTS</codeph>, helps to find all the values from one table that do not have any
+ corresponding values in another table.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>EXISTS (<varname>subquery</varname>)
+NOT EXISTS (<varname>subquery</varname>)
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ The subquery can refer to a different table than the outer query block, or the same table. For example, you
+ might use <codeph>EXISTS</codeph> or <codeph>NOT EXISTS</codeph> to check the existence of parent/child
+ relationships between two columns of the same table.
+ </p>
+
+ <p>
+ You can also use operators and function calls within the subquery to test for kinds of relationships
+ other than strict equality. For example, you might use a call to <codeph>COUNT()</codeph> in the subquery
+ to check whether the number of matching values is higher or lower than some limit. You might call a UDF in
+ the subquery to check whether values in one table match a hashed representation of those same values in a
+ different table.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/null_blurb"/>
+
+ <p>
+ If the subquery returns any value at all (even <codeph>NULL</codeph>), <codeph>EXISTS</codeph> returns
+ <codeph>TRUE</codeph> and <codeph>NOT EXISTS</codeph> returns <codeph>FALSE</codeph>.
+ </p>
+
+ <p>
+ The following example shows how even when the subquery returns only <codeph>NULL</codeph> values,
+ <codeph>EXISTS</codeph> still returns <codeph>TRUE</codeph> and thus matches all the rows from the table in
+ the outer query block.
+ </p>
+
+<codeblock>[localhost:21000] > create table all_nulls (x int);
+[localhost:21000] > insert into all_nulls values (null), (null), (null);
+[localhost:21000] > select y from t2 where exists (select x from all_nulls);
++---+
+| y |
++---+
+| 2 |
+| 4 |
+| 6 |
++---+
+</codeblock>
+
+ <p>
+ However, if the table in the subquery is empty and so the subquery returns an empty result set,
+ <codeph>EXISTS</codeph> returns <codeph>FALSE</codeph>:
+ </p>
+
+<codeblock>[localhost:21000] > create table empty (x int);
+[localhost:21000] > select y from t2 where exists (select x from empty);
+[localhost:21000] >
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/subquery_no_limit"/>
+
+ <p>
+ The <codeph>NOT EXISTS</codeph> operator requires a correlated subquery.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_caveat_no_operator"/>
+
+<!-- To do: construct an EXISTS / NOT EXISTS example for complex types. -->
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+<!-- Maybe turn this into a conref if the same set of tables gets used for subqueries, EXISTS, other places. -->
+<!-- Yes, the material was reused under Subqueries for anti-joins. -->
+ The following examples refer to these simple tables containing small sets of integers or strings:
+<codeblock>[localhost:21000] > create table t1 (x int);
+[localhost:21000] > insert into t1 values (1), (2), (3), (4), (5), (6);
+
+[localhost:21000] > create table t2 (y int);
+[localhost:21000] > insert into t2 values (2), (4), (6);
+
+[localhost:21000] > create table t3 (z int);
+[localhost:21000] > insert into t3 values (1), (3), (5);
+
+[localhost:21000] > create table month_names (m string);
+[localhost:21000] > insert into month_names values
+ > ('January'), ('February'), ('March'),
+ > ('April'), ('May'), ('June'), ('July'),
+ > ('August'), ('September'), ('October'),
+ > ('November'), ('December');
+</codeblock>
+ </p>
+
+ <p>
+ The following example shows a correlated subquery that finds all the values in one table that exist in
+ another table. For each value <codeph>X</codeph> from <codeph>T1</codeph>, the query checks if the
+ <codeph>Y</codeph> column of <codeph>T2</codeph> contains an identical value, and the
+ <codeph>EXISTS</codeph> operator returns <codeph>TRUE</codeph> or <codeph>FALSE</codeph> as appropriate in
+ each case.
+ </p>
+
+<codeblock>[localhost:21000] > select x from t1 where exists (select y from t2 where t1.x = y);
++---+
+| x |
++---+
+| 2 |
+| 4 |
+| 6 |
++---+
+</codeblock>
+
+ <p>
+ An uncorrelated query is less interesting in this case. Because the subquery always returns
+ <codeph>TRUE</codeph>, all rows from <codeph>T1</codeph> are returned. If the table contents were changed
+ so that the subquery did not match any rows, none of the rows from <codeph>T1</codeph> would be returned.
+ </p>
+
+<codeblock>[localhost:21000] > select x from t1 where exists (select y from t2 where y > 5);
++---+
+| x |
++---+
+| 1 |
+| 2 |
+| 3 |
+| 4 |
+| 5 |
+| 6 |
++---+
+</codeblock>
+
+ <p>
+ The following example shows how an uncorrelated subquery can test for the existence of some condition
+ within a table. By using <codeph>LIMIT 1</codeph> or an aggregate function, the query returns a single
+ result or no result based on whether the subquery matches any rows. Here, we know that <codeph>T1</codeph>
+ and <codeph>T2</codeph> contain some even numbers, but <codeph>T3</codeph> does not.
+ </p>
+
+<codeblock>[localhost:21000] > select "contains an even number" from t1 where exists (select x from t1 where x % 2 = 0) limit 1;
++---------------------------+
+| 'contains an even number' |
++---------------------------+
+| contains an even number |
++---------------------------+
+[localhost:21000] > select "contains an even number" as assertion from t1 where exists (select x from t1 where x % 2 = 0) limit 1;
++-------------------------+
+| assertion |
++-------------------------+
+| contains an even number |
++-------------------------+
+[localhost:21000] > select "contains an even number" as assertion from t2 where exists (select x from t2 where y % 2 = 0) limit 1;
+ERROR: AnalysisException: couldn't resolve column reference: 'x'
+[localhost:21000] > select "contains an even number" as assertion from t2 where exists (select y from t2 where y % 2 = 0) limit 1;
++-------------------------+
+| assertion |
++-------------------------+
+| contains an even number |
++-------------------------+
+[localhost:21000] > select "contains an even number" as assertion from t3 where exists (select z from t3 where z % 2 = 0) limit 1;
+[localhost:21000] >
+</codeblock>
+
+ <p>
+ The following example finds numbers in one table that are 1 greater than numbers from another table. The
+ <codeph>EXISTS</codeph> notation is simpler than an equivalent <codeph>CROSS JOIN</codeph> between the
+ tables. (The example then also illustrates how the same test could be performed using an
+ <codeph>IN</codeph> operator.)
+ </p>
+
+<codeblock>[localhost:21000] > select x from t1 where exists (select y from t2 where x = y + 1);
++---+
+| x |
++---+
+| 3 |
+| 5 |
++---+
+[localhost:21000] > select x from t1 where x in (select y + 1 from t2);
++---+
+| x |
++---+
+| 3 |
+| 5 |
++---+
+</codeblock>
+
+ <p>
+ The following example finds values from one table that do not exist in another table.
+ </p>
+
+<codeblock>[localhost:21000] > select x from t1 where not exists (select y from t2 where x = y);
++---+
+| x |
++---+
+| 1 |
+| 3 |
+| 5 |
++---+
+</codeblock>
+
+ <p>
+ The following example uses the <codeph>NOT EXISTS</codeph> operator to find all the leaf nodes in
+ tree-structured data. This simplified <q>tree of life</q> has multiple levels (class, order, family, and so
+ on), with each item pointing upward through a <codeph>PARENT</codeph> pointer. The example runs an outer
+ query and a subquery on the same table, returning only those items whose <codeph>ID</codeph> value is
+ <i>not</i> referenced by the <codeph>PARENT</codeph> of any other item.
+ </p>
+
+<codeblock>[localhost:21000] > create table tree (id int, parent int, name string);
+[localhost:21000] > insert overwrite tree values
+ > (0, null, "animals"),
+ > (1, 0, "placentals"),
+ > (2, 0, "marsupials"),
+ > (3, 1, "bats"),
+ > (4, 1, "cats"),
+ > (5, 2, "kangaroos"),
+ > (6, 4, "lions"),
+ > (7, 4, "tigers"),
+ > (8, 5, "red kangaroo"),
+ > (9, 2, "wallabies");
+[localhost:21000] > select name as "leaf node" from tree one
+ > where not exists (select parent from tree two where one.id = two.parent);
++--------------+
+| leaf node |
++--------------+
+| bats |
+| lions |
+| tigers |
+| red kangaroo |
+| wallabies |
++--------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_subqueries.xml#subqueries"/>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="in">
+
+ <title>IN Operator</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">IN operator</indexterm>
+ <indexterm audience="Cloudera">NOT IN operator</indexterm>
+ The <codeph>IN</codeph> operator compares an argument value to a set of values, and returns
+ <codeph>TRUE</codeph> if the argument matches any value in the set. The <codeph>NOT IN</codeph> operator
+ reverses the comparison, and checks if the argument value is not part of a set of values.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock rev="2.0.0"><varname>expression</varname> IN (<varname>expression</varname> [, <varname>expression</varname>])
+<varname>expression</varname> IN (<varname>subquery</varname>)
+
+<varname>expression</varname> NOT IN (<varname>expression</varname> [, <varname>expression</varname>])
+<varname>expression</varname> NOT IN (<varname>subquery</varname>)
+</codeblock>
+
+ <p>
+ The left-hand expression and the set of comparison values must be of compatible types.
+ </p>
+
+ <p>
+ The left-hand expression must consist only of a single value, not a tuple. Although the left-hand
+ expression is typically a column name, it could also be some other value. For example, the
+ <codeph>WHERE</codeph> clauses <codeph>WHERE id IN (5)</codeph> and <codeph>WHERE 5 IN (id)</codeph>
+ produce the same results.
+ </p>
+
+ <p>
+ The set of values to check against can be specified as constants, function calls, column names, or other
+ expressions in the query text. When the values are listed explicitly, the maximum number of expressions is
+ 10,000.
+ </p>
+
+ <p rev="2.0.0">
+ In Impala 2.0 and higher, the set of values can also be generated by a subquery. <codeph>IN</codeph> can
+ evaluate an unlimited number of results using a subquery.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Any expression using the <codeph>IN</codeph> operator could be rewritten as a series of equality tests
+ connected with <codeph>OR</codeph>, but the <codeph>IN</codeph> syntax is often clearer, more concise, and
+ easier for Impala to optimize. For example, with partitioned tables, queries frequently use
+ <codeph>IN</codeph> clauses to filter data by comparing the partition key columns to specific values.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/null_blurb"/>
+
+ <p>
+ If there really is a matching non-null value, <codeph>IN</codeph> returns <codeph>TRUE</codeph>:
+ </p>
+
+<codeblock>[localhost:21000] > select 1 in (1,null,2,3);
++----------------------+
+| 1 in (1, null, 2, 3) |
++----------------------+
+| true |
++----------------------+
+[localhost:21000] > select 1 not in (1,null,2,3);
++--------------------------+
+| 1 not in (1, null, 2, 3) |
++--------------------------+
+| false |
++--------------------------+
+</codeblock>
+
+ <p>
+ If the searched value is not found in the comparison values, and the comparison values include
+ <codeph>NULL</codeph>, the result is <codeph>NULL</codeph>:
+ </p>
+
+<codeblock>[localhost:21000] > select 5 in (1,null,2,3);
++----------------------+
+| 5 in (1, null, 2, 3) |
++----------------------+
+| NULL |
++----------------------+
+[localhost:21000] > select 5 not in (1,null,2,3);
++--------------------------+
+| 5 not in (1, null, 2, 3) |
++--------------------------+
+| NULL |
++--------------------------+
+[localhost:21000] > select 1 in (null);
++-------------+
+| 1 in (null) |
++-------------+
+| NULL |
++-------------+
+[localhost:21000] > select 1 not in (null);
++-----------------+
+| 1 not in (null) |
++-----------------+
+| NULL |
++-----------------+
+</codeblock>
+
+ <p>
+ If the left-hand argument is <codeph>NULL</codeph>, <codeph>IN</codeph> always returns
+ <codeph>NULL</codeph>. This rule applies even if the comparison values include <codeph>NULL</codeph>.
+ </p>
+
+<codeblock>[localhost:21000] > select null in (1,2,3);
++-------------------+
+| null in (1, 2, 3) |
++-------------------+
+| NULL |
++-------------------+
+[localhost:21000] > select null not in (1,2,3);
++-----------------------+
+| null not in (1, 2, 3) |
++-----------------------+
+| NULL |
++-----------------------+
+[localhost:21000] > select null in (null);
++----------------+
+| null in (null) |
++----------------+
+| NULL |
++----------------+
+[localhost:21000] > select null not in (null);
++--------------------+
+| null not in (null) |
++--------------------+
+| NULL |
++--------------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/enhanced_in_20"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_caveat_no_operator"/>
+
+ <p rev="2.3.0">
+ The following example shows how to do an arithmetic operation using a numeric field of a <codeph>STRUCT</codeph> type
+ that is an item within an <codeph>ARRAY</codeph> column. Once the scalar numeric value <codeph>R_NATIONKEY</codeph>
+ is extracted, it can be used in an arithmetic expression, such as multiplying by 10:
+ </p>
+
+<codeblock rev="2.3.0">
+-- The SMALLINT is a field within an array of structs.
+describe region;
++-------------+-------------------------+---------+
+| name | type | comment |
++-------------+-------------------------+---------+
+| r_regionkey | smallint | |
+| r_name | string | |
+| r_comment | string | |
+| r_nations | array<struct< | |
+| | n_nationkey:smallint, | |
+| | n_name:string, | |
+| | n_comment:string | |
+| | >> | |
++-------------+-------------------------+---------+
+
+-- When we refer to the scalar value using dot notation,
+-- we can use arithmetic and comparison operators on it
+-- like any other number.
+select r_name, nation.item.n_name, nation.item.n_nationkey
+from region, region.r_nations as nation
+where nation.item.n_nationkey in (1,3,5);
++---------+-------------+------------------+
+| r_name | item.n_name | item.n_nationkey |
++---------+-------------+------------------+
+| AMERICA | CANADA | 3 |
+| AMERICA | ARGENTINA | 1 |
+| AFRICA | ETHIOPIA | 5 |
++---------+-------------+------------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/subquery_no_limit"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>-- Using IN is concise and self-documenting.
+SELECT * FROM t1 WHERE c1 IN (1,2,10);
+-- Equivalent to series of = comparisons ORed together.
+SELECT * FROM t1 WHERE c1 = 1 OR c1 = 2 OR c1 = 10;
+
+SELECT c1 AS "starts with vowel" FROM t2 WHERE upper(substr(c1,1,1)) IN ('A','E','I','O','U');
+
+SELECT COUNT(DISTINCT(visitor_id)) FROM web_traffic WHERE month IN ('January','June','July');</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_subqueries.xml#subqueries"/>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept audience="Cloudera" rev="2.1.0" id="intersect">
+
+ <title>INTERSECT Operator</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">INTERSECT operator</indexterm>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="is_null">
+
+ <title>IS NULL Operator</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">IS NULL operator</indexterm>
+ <indexterm audience="Cloudera">IS NOT NULL operator</indexterm>
+ The <codeph>IS NULL</codeph> operator, and its converse the <codeph>IS NOT NULL</codeph> operator, test
+ whether a specified value is <codeph><xref href="impala_literals.xml#null">NULL</xref></codeph>. Because
+ using <codeph>NULL</codeph> with any of the other comparison operators such as <codeph>=</codeph> or
+ <codeph>!=</codeph> also returns <codeph>NULL</codeph> rather than <codeph>TRUE</codeph> or
+ <codeph>FALSE</codeph>, you use a special-purpose comparison operator to check for this special condition.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock><varname>expression</varname> IS NULL
+<varname>expression</varname> IS NOT NULL
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ In many cases, <codeph>NULL</codeph> values indicate some incorrect or incomplete processing during data
+ ingestion or conversion. You might check whether any values in a column are <codeph>NULL</codeph>, and if
+ so take some followup action to fill them in.
+ </p>
+
+ <p>
+ With sparse data, often represented in <q>wide</q> tables, it is common for most values to be
+ <codeph>NULL</codeph> with only an occasional non-<codeph>NULL</codeph> value. In those cases, you can use
+ the <codeph>IS NOT NULL</codeph> operator to identify the rows containing any data at all for a particular
+ column, regardless of the actual value.
+ </p>
+
+ <p>
+ With a well-designed database schema, effective use of <codeph>NULL</codeph> values and <codeph>IS
+ NULL</codeph> and <codeph>IS NOT NULL</codeph> operators can save having to design custom logic around
+ special values such as 0, -1, <codeph>'N/A'</codeph>, empty string, and so on. <codeph>NULL</codeph> lets
+ you distinguish between a value that is known to be 0, false, or empty, and a truly unknown value.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>-- If this value is non-zero, something is wrong.
+select count(*) from employees where employee_id is null;
+
+-- With data from disparate sources, some fields might be blank.
+-- Not necessarily an error condition.
+select count(*) from census where household_income is null;
+
+-- Sometimes we expect fields to be null, and followup action
+-- is needed when they are not.
+select count(*) from web_traffic where weird_http_code is not null;</codeblock>
+ </conbody>
+ </concept>
+
+ <concept id="like">
+
+ <title>LIKE Operator</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">LIKE operator</indexterm>
+ A comparison operator for <codeph>STRING</codeph> data, with basic wildcard capability using
+ <codeph>_</codeph> to match a single character and <codeph>%</codeph> to match multiple characters. The
+ argument expression must match the entire string value. Typically, it is more efficient to put any
+ <codeph>%</codeph> wildcard match at the end of the string.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock><varname>string_expression</varname> LIKE <varname>wildcard_expression</varname>
+<varname>string_expression</varname> NOT LIKE <varname>wildcard_expression</varname>
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_caveat_no_operator"/>
+
+<!-- To do: construct a LIKE example for complex types. -->
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>select distinct c_last_name from customer where c_last_name like 'Mc%' or c_last_name like 'Mac%';
+select count(c_last_name) from customer where c_last_name like 'M%';
+select c_email_address from customer where c_email_address like '%.edu';
+
+-- We can find 4-letter names beginning with 'M' by calling functions...
+select distinct c_last_name from customer where length(c_last_name) = 4 and substr(c_last_name,1,1) = 'M';
+-- ...or in a more readable way by matching M followed by exactly 3 characters.
+select distinct c_last_name from customer where c_last_name like 'M___';</codeblock>
+
+ <p>
+ For a more general kind of search operator using regular expressions, see
+ <xref href="impala_operators.xml#regexp"/>.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="logical_operators">
+
+ <title>Logical Operators</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">logical operators</indexterm>
+ Logical operators return a <codeph>BOOLEAN</codeph> value, based on a binary or unary logical operation
+ between arguments that are also Booleans. Typically, the argument expressions use
+ <xref href="impala_operators.xml#comparison_operators">comparison operators</xref>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock><varname>boolean_expression</varname> <varname>binary_logical_operator</varname> <varname>boolean_expression</varname>
+<varname>unary_logical_operator</varname> <varname>boolean_expression</varname>
+</codeblock>
+
+ <p>
+ The Impala logical operators are:
+ </p>
+
+ <ul>
+ <li>
+ <codeph>AND</codeph>: A binary operator that returns <codeph>true</codeph> if its left-hand and
+ right-hand arguments both evaluate to <codeph>true</codeph>, <codeph>NULL</codeph> if either argument is
+ <codeph>NULL</codeph>, and <codeph>false</codeph> otherwise.
+ </li>
+
+ <li>
+ <codeph>OR</codeph>: A binary operator that returns <codeph>true</codeph> if either of its left-hand and
+ right-hand arguments evaluate to <codeph>true</codeph>, <codeph>NULL</codeph> if one argument is
+ <codeph>NULL</codeph> and the other is either <codeph>NULL</codeph> or <codeph>false</codeph>, and
+ <codeph>false</codeph> otherwise.
+ </li>
+
+ <li>
+ <codeph>NOT</codeph>: A unary operator that flips the state of a Boolean expression from
+ <codeph>true</codeph> to <codeph>false</codeph>, or <codeph>false</codeph> to <codeph>true</codeph>. If
+ the argument expression is <codeph>NULL</codeph>, the result remains <codeph>NULL</codeph>. (When
+ <codeph>NOT</codeph> is used this way as a unary logical operator, it works differently than the
+ <codeph>IS NOT NULL</codeph> comparison operator, which returns <codeph>true</codeph> when applied to a
+ <codeph>NULL</codeph>.)
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_caveat_no_operator"/>
+
+ <p rev="2.3.0">
+ The following example shows how to do an arithmetic operation using a numeric field of a <codeph>STRUCT</codeph> type
+ that is an item within an <codeph>ARRAY</codeph> column. Once the scalar numeric value <codeph>R_NATIONKEY</codeph>
+ is extracted, it can be used in an arithmetic expression, such as multiplying by 10:
+ </p>
+
+<codeblock rev="2.3.0">
+-- The SMALLINT is a field within an array of structs.
+describe region;
++-------------+-------------------------+---------+
+| name | type | comment |
++-------------+-------------------------+---------+
+| r_regionkey | smallint | |
+| r_name | string | |
+| r_comment | string | |
+| r_nations | array&lt;struct&lt; | |
+| | n_nationkey:smallint, | |
+| | n_name:string, | |
+| | n_comment:string | |
+| | >> | |
++-------------+-------------------------+---------+
+
+-- When we refer to the scalar value using dot notation,
+-- we can use arithmetic and comparison operators on it
+-- like any other number.
+select r_name, nation.item.n_name, nation.item.n_nationkey
+ from region, region.r_nations as nation
+where
+ nation.item.n_nationkey between 3 and 5
+ or nation.item.n_nationkey &lt; 15;
++-------------+----------------+------------------+
+| r_name | item.n_name | item.n_nationkey |
++-------------+----------------+------------------+
+| EUROPE | UNITED KINGDOM | 23 |
+| EUROPE | RUSSIA | 22 |
+| EUROPE | ROMANIA | 19 |
+| ASIA | VIETNAM | 21 |
+| ASIA | CHINA | 18 |
+| AMERICA | UNITED STATES | 24 |
+| AMERICA | PERU | 17 |
+| AMERICA | CANADA | 3 |
+| MIDDLE EAST | SAUDI ARABIA | 20 |
+| MIDDLE EAST | EGYPT | 4 |
+| AFRICA | MOZAMBIQUE | 16 |
+| AFRICA | ETHIOPIA | 5 |
++-------------+----------------+------------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ These examples demonstrate the <codeph>AND</codeph> operator:
+ </p>
+
+<codeblock>[localhost:21000] > select true and true;
++---------------+
+| true and true |
++---------------+
+| true |
++---------------+
+[localhost:21000] > select true and false;
++----------------+
+| true and false |
++----------------+
+| false |
++----------------+
+[localhost:21000] > select false and false;
++-----------------+
+| false and false |
++-----------------+
+| false |
++-----------------+
+[localhost:21000] > select true and null;
++---------------+
+| true and null |
++---------------+
+| NULL |
++---------------+
+[localhost:21000] > select (10 > 2) and (6 != 9);
++-----------------------+
+| (10 > 2) and (6 != 9) |
++-----------------------+
+| true |
++-----------------------+
+</codeblock>
+
+ <p>
+ These examples demonstrate the <codeph>OR</codeph> operator:
+ </p>
+
+<codeblock>[localhost:21000] > select true or true;
++--------------+
+| true or true |
++--------------+
+| true |
++--------------+
+[localhost:21000] > select true or false;
++---------------+
+| true or false |
++---------------+
+| true |
++---------------+
+[localhost:21000] > select false or false;
++----------------+
+| false or false |
++----------------+
+| false |
++----------------+
+[localhost:21000] > select true or null;
++--------------+
+| true or null |
++--------------+
+| true |
++--------------+
+[localhost:21000] > select null or true;
++--------------+
+| null or true |
++--------------+
+| true |
++--------------+
+[localhost:21000] > select false or null;
++---------------+
+| false or null |
++---------------+
+| NULL |
++---------------+
+[localhost:21000] > select (1 = 1) or ('hello' = 'world');
++--------------------------------+
+| (1 = 1) or ('hello' = 'world') |
++--------------------------------+
+| true |
++--------------------------------+
+[localhost:21000] > select (2 + 2 != 4) or (-1 > 0);
++--------------------------+
+| (2 + 2 != 4) or (-1 > 0) |
++--------------------------+
+| false |
++--------------------------+
+</codeblock>
+
+ <p>
+ These examples demonstrate the <codeph>NOT</codeph> operator:
+ </p>
+
+<codeblock>[localhost:21000] > select not true;
++----------+
+| not true |
++----------+
+| false |
++----------+
+[localhost:21000] > select not false;
++-----------+
+| not false |
++-----------+
+| true |
++-----------+
+[localhost:21000] > select not null;
++----------+
+| not null |
++----------+
+| NULL |
++----------+
+[localhost:21000] > select not (1=1);
++-------------+
+| not (1 = 1) |
++-------------+
+| false |
++-------------+
+</codeblock>
+ </conbody>
+ </concept>
+
+ <concept id="regexp">
+
+ <title>REGEXP Operator</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">REGEXP operator</indexterm>
+ Tests whether a value matches a regular expression. Uses the POSIX regular expression syntax where
+ <codeph>^</codeph> and <codeph>$</codeph> match the beginning and end of the string, <codeph>.</codeph>
+ represents any single character, <codeph>*</codeph> represents a sequence of zero or more items,
+ <codeph>+</codeph> represents a sequence of one or more items, <codeph>?</codeph> produces a non-greedy
+ match, and so on.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+<codeblock><varname>string_expression</varname> REGEXP <varname>regular_expression</varname>
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ The regular expression must match the entire value, not just occur somewhere inside it. Use
+ <codeph>.*</codeph> at the beginning and/or the end if you only need to match characters anywhere in the
+ middle. Thus, the <codeph>^</codeph> and <codeph>$</codeph> atoms are often redundant, although you might
+ already have them in your expression strings that you reuse from elsewhere.
+ </p>
+
+ <p>
+ The <codeph>RLIKE</codeph> operator is a synonym for <codeph>REGEXP</codeph>.
+ </p>
+
+ <p>
+ The <codeph>|</codeph> symbol is the alternation operator, typically used within <codeph>()</codeph> to
+ match different sequences. The <codeph>()</codeph> groups do not allow backreferences. To retrieve the part
+ of a value matched within a <codeph>()</codeph> section, use the
+ <codeph><xref href="impala_string_functions.xml#string_functions/regexp_extract">regexp_extract()</xref></codeph>
+ built-in function.
+ </p>
+
+ <note rev="1.3.1">
+ <p conref="../shared/impala_common.xml#common/regexp_matching"/>
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/regexp_re2"/>
+
+ <p conref="../shared/impala_common.xml#common/regexp_re2_warning"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_caveat_no_operator"/>
+
+<!-- To do: construct a REGEXP example for complex types. -->
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following examples demonstrate the identical syntax for the <codeph>REGEXP</codeph> and
+ <codeph>RLIKE</codeph> operators.
+ </p>
+
+<!-- Same examples shown for both REGEXP and RLIKE operators. -->
+
+<codeblock conref="../shared/impala_common.xml#common/regexp_rlike_examples"/>
+ </conbody>
+ </concept>
+
+ <concept id="rlike">
+
+ <title>RLIKE Operator</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">RLIKE operator</indexterm>
+ Synonym for the <codeph>REGEXP</codeph> operator. See <xref href="impala_operators.xml#regexp"/> for
+ details.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following examples demonstrate the identical syntax for the <codeph>REGEXP</codeph> and
+ <codeph>RLIKE</codeph> operators.
+ </p>
+
+<!-- Same examples shown for both REGEXP and RLIKE operators. -->
+
+<codeblock conref="../shared/impala_common.xml#common/regexp_rlike_examples"/>
+ </conbody>
+ </concept>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_order_by.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_order_by.xml b/docs/topics/impala_order_by.xml
new file mode 100644
index 0000000..f3042e5
--- /dev/null
+++ b/docs/topics/impala_order_by.xml
@@ -0,0 +1,316 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="order_by">
+
+ <title>ORDER BY Clause</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ The familiar <codeph>ORDER BY</codeph> clause of a <codeph>SELECT</codeph> statement sorts the result set
+ based on the values from one or more columns.
+ </p>
+
+ <p>
+ For distributed queries, this is a relatively expensive operation, because the entire result set must be
+ produced and transferred to one node before the sorting can happen. This can require more memory capacity
+ than a query without <codeph>ORDER BY</codeph>. Even if the query takes approximately the same time to finish
+ with or without the <codeph>ORDER BY</codeph> clause, subjectively it can appear slower because no results
+ are available until all processing is finished, rather than results coming back gradually as rows matching
+ the <codeph>WHERE</codeph> clause are found. Therefore, if you only need the first N results from the sorted
+ result set, also include the <codeph>LIMIT</codeph> clause, which reduces network overhead and the memory
+ requirement on the coordinator node.
+ </p>
+
+ <note>
+ <p rev="1.4.0 obwl">
+ In Impala 1.4.0 and higher, the <codeph>LIMIT</codeph> clause is now optional (rather than required) for
+ queries that use the <codeph>ORDER BY</codeph> clause. Impala automatically uses a temporary disk work area
+ to perform the sort if the sort operation would otherwise exceed the Impala memory limit for a particular
+ data node.
+ </p>
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ The full syntax for the <codeph>ORDER BY</codeph> clause is:
+ </p>
+
+<codeblock rev="1.2.1">ORDER BY <varname>col_ref</varname> [, <varname>col_ref</varname> ...] [ASC | DESC] [NULLS FIRST | NULLS LAST]
+
+col_ref ::= <varname>column_name</varname> | <varname>integer_literal</varname>
+</codeblock>
+
+ <p>
+ Although the most common usage is <codeph>ORDER BY <varname>column_name</varname></codeph>, you can also
+ specify <codeph>ORDER BY 1</codeph> to sort by the first column of the result set, <codeph>ORDER BY
+ 2</codeph> to sort by the second column, and so on. The number must be a numeric literal, not some other kind
+ of constant expression. (If the argument is some other expression, even a <codeph>STRING</codeph> value, the
+ query succeeds but the order of results is undefined.)
+ </p>
+
+ <p>
+ <codeph>ORDER BY <varname>column_number</varname></codeph> can only be used when the query explicitly lists
+ the columns in the <codeph>SELECT</codeph> list, not with <codeph>SELECT *</codeph> queries.
+ </p>
+
+ <p>
+ <b>Ascending and descending sorts:</b>
+ </p>
+
+ <p>
+ The default sort order (the same as using the <codeph>ASC</codeph> keyword) puts the smallest values at the
+ start of the result set, and the largest values at the end. Specifying the <codeph>DESC</codeph> keyword
+ reverses that order.
+ </p>
+
+ <p>
+ <b>Sort order for NULL values:</b>
+ </p>
+
+ <p rev="1.2.1">
+ See <xref href="impala_literals.xml#null"/> for details about how <codeph>NULL</codeph> values are positioned
+ in the sorted result set, and how to use the <codeph>NULLS FIRST</codeph> and <codeph>NULLS LAST</codeph>
+ clauses. (The sort position for <codeph>NULL</codeph> values in <codeph>ORDER BY ... DESC</codeph> queries is
+ changed in Impala 1.2.1 and higher to be more standards-compliant, and the <codeph>NULLS FIRST</codeph> and
+ <codeph>NULLS LAST</codeph> keywords are new in Impala 1.2.1.)
+ </p>
+
+ <p rev="obwl" conref="../shared/impala_common.xml#common/order_by_limit"/>
+
+ <!-- Good to show an example of cases where ORDER BY does and doesn't work with complex types. -->
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p rev="2.3.0">
+ In CDH 5.5 / Impala 2.3 and higher, the complex data types <codeph>STRUCT</codeph>,
+ <codeph>ARRAY</codeph>, and <codeph>MAP</codeph> are available. These columns cannot
+ be referenced directly in the <codeph>ORDER BY</codeph> clause.
+ When you query a complex type column, you use join notation to <q>unpack</q> the elements
+ of the complex type, and within the join query you can include an <codeph>ORDER BY</codeph>
+ clause to control the order in the result set of the scalar elements from the complex type.
+ See <xref href="impala_complex_types.xml#complex_types"/> for details about Impala support for complex types.
+ </p>
+
+ <p>
+ The following query shows how a complex type column cannot be directly used in an <codeph>ORDER BY</codeph> clause:
+ </p>
+
+<codeblock>CREATE TABLE games (id BIGINT, score ARRAY &lt;BIGINT&gt;) STORED AS PARQUET;
+...use LOAD DATA to load externally created Parquet files into the table...
+SELECT id FROM games ORDER BY score DESC;
+ERROR: AnalysisException: ORDER BY expression 'score' with complex type 'ARRAY&lt;BIGINT&gt;' is not supported.
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following query retrieves the user ID and score, only for scores greater than one million,
+ with the highest scores for each user listed first.
+ Because the individual array elements are now represented as separate rows in the result set,
+ they can be used in the <codeph>ORDER BY</codeph> clause, referenced using the <codeph>ITEM</codeph>
+ pseudocolumn that represents each array element.
+ </p>
+
+<codeblock>SELECT id, item FROM games, games.score
+ WHERE item > 1000000
+ORDER BY id, item desc;
+</codeblock>
+
+ <p>
+ The following queries use similar <codeph>ORDER BY</codeph> techniques with variations of the <codeph>GAMES</codeph>
+ table, where the complex type is an <codeph>ARRAY</codeph> containing <codeph>STRUCT</codeph> or <codeph>MAP</codeph>
+ elements to represent additional details about each game that was played.
+ For an array of structures, the fields of the structure are referenced as <codeph>ITEM.<varname>field_name</varname></codeph>.
+ For an array of maps, the keys and values within each array element are referenced as <codeph>ITEM.KEY</codeph>
+ and <codeph>ITEM.VALUE</codeph>.
+ </p>
+
+<codeblock>CREATE TABLE games2 (id BIGINT, play array &lt; struct &lt;game_name: string, score: BIGINT, high_score: boolean&gt; &gt;) STORED AS PARQUET;
+...use LOAD DATA to load externally created Parquet files into the table...
+SELECT id, item.game_name, item.score FROM games2, games2.play
+ WHERE item.score > 1000000
+ORDER BY id, item.score DESC;
+
+CREATE TABLE games3 (id BIGINT, play ARRAY &lt; MAP &lt;STRING, BIGINT&gt; &gt;) STORED AS PARQUET;
+...use LOAD DATA to load externally created Parquet files into the table...
+SELECT id, info.key AS k, info.value AS v from games3, games3.play AS plays, games3.play.item AS info
+ WHERE info.KEY = 'score' AND info.VALUE > 1000000
+ORDER BY id, info.value desc;
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Although the <codeph>LIMIT</codeph> clause is now optional on <codeph>ORDER BY</codeph> queries, if your
+ query only needs some number of rows that you can predict in advance, use the <codeph>LIMIT</codeph> clause
+ to reduce unnecessary processing. For example, if the query has a clause <codeph>LIMIT 10</codeph>, each data
+ node sorts its portion of the relevant result set and only returns 10 rows to the coordinator node. The
+ coordinator node picks the 10 highest or lowest row values out of this small intermediate result set.
+ </p>
+
+ <p>
+ If an <codeph>ORDER BY</codeph> clause is applied to an early phase of query processing, such as a subquery
+ or a view definition, Impala ignores the <codeph>ORDER BY</codeph> clause. To get ordered results from a
+ subquery or view, apply an <codeph>ORDER BY</codeph> clause to the outermost or final <codeph>SELECT</codeph>
+ level.
+ </p>
+
+ <p>
+ <codeph>ORDER BY</codeph> is often used in combination with <codeph>LIMIT</codeph> to perform <q>top-N</q>
+ queries:
+ </p>
+
+<codeblock>SELECT user_id AS "Top 10 Visitors", SUM(page_views) FROM web_stats
+ GROUP BY user_id
+ ORDER BY SUM(page_views) DESC LIMIT 10;
+</codeblock>
+
+ <p>
+ <codeph>ORDER BY</codeph> is sometimes used in combination with <codeph>OFFSET</codeph> and
+ <codeph>LIMIT</codeph> to paginate query results, although it is relatively inefficient to issue multiple
+ queries like this against the large tables typically used with Impala:
+ </p>
+
+<codeblock>SELECT page_title AS "Page 1 of search results", page_url FROM search_content
+ WHERE LOWER(page_title) LIKE '%game%'
+ ORDER BY page_title LIMIT 10 OFFSET 0;
+SELECT page_title AS "Page 2 of search results", page_url FROM search_content
+ WHERE LOWER(page_title) LIKE '%game%'
+ ORDER BY page_title LIMIT 10 OFFSET 10;
+SELECT page_title AS "Page 3 of search results", page_url FROM search_content
+ WHERE LOWER(page_title) LIKE '%game%'
+ ORDER BY page_title LIMIT 10 OFFSET 20;
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/internals_blurb"/>
+
+ <p>
+ Impala sorts the intermediate results of an <codeph>ORDER BY</codeph> clause in memory whenever practical. In
+ a cluster of N data nodes, each node sorts roughly 1/Nth of the result set, the exact proportion varying
+ depending on how the data matching the query is distributed in HDFS.
+ </p>
+
+ <p>
+ If the size of the sorted intermediate result set on any data node would cause the query to exceed the Impala
+ memory limit, Impala sorts as much as practical in memory, then writes partially sorted data to disk. (This
+ technique is known in industry terminology as <q>external sorting</q> and <q>spilling to disk</q>.) As each
+ 8 MB batch of data is written to disk, Impala frees the corresponding memory to sort a new 8 MB batch of
+ data. When all the data has been processed, a final merge sort operation is performed to correctly order the
+ in-memory and on-disk results as the result set is transmitted back to the coordinator node. When external
+ sorting becomes necessary, Impala requires approximately 60 MB of RAM at a minimum for the buffers needed to
+ read, write, and sort the intermediate results. If more RAM is available on the data node, Impala will use
+ the additional RAM to minimize the amount of disk I/O for sorting.
+ </p>
+
+ <p>
+ This external sort technique is used as appropriate on each data node (possibly including the coordinator
+ node) to sort the portion of the result set that is processed on that node. When the sorted intermediate
+ results are sent back to the coordinator node to produce the final result set, the coordinator node uses a
+ merge sort technique to produce a final sorted result set without using any extra resources on the
+ coordinator node.
+ </p>
+
+ <p rev="obwl">
+ <b>Configuration for disk usage:</b>
+ </p>
+
+ <p rev="obwl" conref="../shared/impala_common.xml#common/order_by_scratch_dir"/>
+
+<!-- Here is actually the more logical place to collect all those examples, move them from SELECT and cross-reference to here. -->
+
+<!-- <p rev="obwl" conref="/Content/impala_common_xi44078.xml#common/restrictions_blurb"/> -->
+
+ <p rev="obwl" conref="../shared/impala_common.xml#common/insert_sort_blurb"/>
+
+ <p rev="obwl" conref="../shared/impala_common.xml#common/order_by_view_restriction"/>
+
+ <p>
+ With the lifting of the requirement to include a <codeph>LIMIT</codeph> clause in every <codeph>ORDER
+ BY</codeph> query (in Impala 1.4 and higher):
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ Now the use of scratch disk space raises the possibility of an <q>out of disk space</q> error on a
+ particular data node, as opposed to the previous possibility of an <q>out of memory</q> error. Make sure
+ to keep at least 1 GB free on the filesystem used for temporary sorting work.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ The query options
+ <xref href="impala_default_order_by_limit.xml#default_order_by_limit">DEFAULT_ORDER_BY_LIMIT</xref> and
+ <xref href="impala_abort_on_default_limit_exceeded.xml#abort_on_default_limit_exceeded">ABORT_ON_DEFAULT_LIMIT_EXCEEDED</xref>,
+ which formerly controlled the behavior of <codeph>ORDER BY</codeph> queries with no limit specified, are
+ now ignored.
+ </p>
+ </li>
+ </ul>
+
+ <p rev="obwl" conref="../shared/impala_common.xml#common/null_sorting_change"/>
+<codeblock>[localhost:21000] > create table numbers (x int);
+[localhost:21000] > insert into numbers values (1), (null), (2), (null), (3);
+[localhost:21000] > select x from numbers order by x nulls first;
++------+
+| x |
++------+
+| NULL |
+| NULL |
+| 1 |
+| 2 |
+| 3 |
++------+
+[localhost:21000] > select x from numbers order by x desc nulls first;
++------+
+| x |
++------+
+| NULL |
+| NULL |
+| 3 |
+| 2 |
+| 1 |
++------+
+[localhost:21000] > select x from numbers order by x nulls last;
++------+
+| x |
++------+
+| 1 |
+| 2 |
+| 3 |
+| NULL |
+| NULL |
++------+
+[localhost:21000] > select x from numbers order by x desc nulls last;
++------+
+| x |
++------+
+| 3 |
+| 2 |
+| 1 |
+| NULL |
+| NULL |
++------+
+</codeblock>
+
+ <p rev="obwl" conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p rev="obwl">
+ See <xref href="impala_select.xml#select"/> for further examples of queries with the <codeph>ORDER
+ BY</codeph> clause.
+ </p>
+
+ <p>
+ Analytic functions use the <codeph>ORDER BY</codeph> clause in a different context to define the sequence in
+ which rows are analyzed. See <xref href="impala_analytic_functions.xml#analytic_functions"/> for details.
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_parquet_compression_codec.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_parquet_compression_codec.xml b/docs/topics/impala_parquet_compression_codec.xml
new file mode 100644
index 0000000..d178a0d
--- /dev/null
+++ b/docs/topics/impala_parquet_compression_codec.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="parquet_compression_codec">
+
+ <title>PARQUET_COMPRESSION_CODEC Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Parquet"/>
+ <data name="Category" value="File Formats"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Deprecated Features"/>
+ <data name="Category" value="Compression"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">PARQUET_COMPRESSION_CODEC query option</indexterm>
+ Deprecated. Use <codeph>COMPRESSION_CODEC</codeph> in Impala 2.0 and later. See
+ <xref href="impala_compression_codec.xml#compression_codec"/> for details.
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_parquet_file_size.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_parquet_file_size.xml b/docs/topics/impala_parquet_file_size.xml
new file mode 100644
index 0000000..396fa92
--- /dev/null
+++ b/docs/topics/impala_parquet_file_size.xml
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="parquet_block_size" id="parquet_file_size">
+
+ <title>PARQUET_FILE_SIZE Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Parquet"/>
+ <data name="Category" value="File Formats"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">PARQUET_FILE_SIZE query option</indexterm>
+ Specifies the maximum size of each Parquet data file produced by Impala <codeph>INSERT</codeph> statements.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ Specify the size in bytes, or with a trailing <codeph>m</codeph> or <codeph>g</codeph> character to indicate
+ megabytes or gigabytes. For example:
+ </p>
+
+<codeblock>-- 128 megabytes.
+set PARQUET_FILE_SIZE=134217728;
+INSERT OVERWRITE parquet_table SELECT * FROM text_table;
+
+-- 512 megabytes.
+set PARQUET_FILE_SIZE=512m;
+INSERT OVERWRITE parquet_table SELECT * FROM text_table;
+
+-- 1 gigabyte.
+set PARQUET_FILE_SIZE=1g;
+INSERT OVERWRITE parquet_table SELECT * FROM text_table;
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ With tables that are small or finely partitioned, the default Parquet block size (formerly 1 GB, now 256 MB
+ in Impala 2.0 and later) could be much larger than needed for each data file. For <codeph>INSERT</codeph>
+ operations into such tables, you can increase parallelism by specifying a smaller
+ <codeph>PARQUET_FILE_SIZE</codeph> value, resulting in more HDFS blocks that can be processed by different
+ nodes.
+<!-- Reducing the file size also reduces the memory required to buffer each block before writing it to disk. -->
+ </p>
+
+ <p>
+ <b>Type:</b> numeric, with optional unit specifier
+ </p>
+
+ <note type="important">
+ <p>
+ Currently, the maximum value for this setting is 1 gigabyte (<codeph>1g</codeph>).
+ Setting a value higher than 1 gigabyte could result in errors during
+ an <codeph>INSERT</codeph> operation.
+ </p>
+ </note>
+
+ <p>
+ <b>Default:</b> 0 (produces files with a target size of 256 MB; files might be larger for very wide tables)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/isilon_blurb"/>
+ <p conref="../shared/impala_common.xml#common/isilon_block_size_caveat"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ For information about the Parquet file format, and how the number and size of data files affects query
+ performance, see <xref href="impala_parquet.xml#parquet"/>.
+ </p>
+
+<!-- Examples actually folded into Syntax earlier. <p conref="../shared/impala_common.xml#common/example_blurb"/> -->
+
+ </conbody>
+</concept>
[02/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_udf.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_udf.xml b/docs/topics/impala_udf.xml
new file mode 100644
index 0000000..53dd8eb
--- /dev/null
+++ b/docs/topics/impala_udf.xml
@@ -0,0 +1,1759 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.2" id="udfs">
+
+ <title>Impala User-Defined Functions (UDFs)</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="UDFs"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ User-defined functions (frequently abbreviated as UDFs) let you code your own application logic for
+ processing column values during an Impala query. For example, a UDF could perform calculations using an
+ external math library, combine several column values into one, do geospatial calculations, or other kinds of
+ tests and transformations that are outside the scope of the built-in SQL operators and functions.
+ </p>
+
+ <p>
+ You can use UDFs to simplify query logic when producing reports, or to transform data in flexible ways when
+ copying from one table to another with the <codeph>INSERT ... SELECT</codeph> syntax.
+ </p>
+
+ <p>
+ You might be familiar with this feature from other database products, under names such as stored functions or
+ stored routines.
+<!--
+ , user-defined aggregate functions (UDAFs), table functions, or window functions.
+ -->
+ </p>
+
+ <p>
+ Impala support for UDFs is available in Impala 1.2 and higher:
+ </p>
+
+ <ul>
+ <li>
+ In Impala 1.1, using UDFs in a query required using the Hive shell. (Because Impala and Hive share the same
+ metastore database, you could switch to Hive to run just those queries requiring UDFs, then switch back to
+ Impala.)
+ </li>
+
+ <li>
+ Starting in Impala 1.2, Impala can run both high-performance native code UDFs written in C++, and
+ Java-based Hive UDFs that you might already have written.
+ </li>
+
+ <li>
+ Impala can run scalar UDFs that return a single value for each row of the result set, and user-defined
+ aggregate functions (UDAFs) that return a value based on a set of rows. Currently, Impala does not support
+ user-defined table functions (UDTFs) or window functions.
+ </li>
+ </ul>
+
+ <p outputclass="toc inpage"/>
+ </conbody>
+
+ <concept id="udf_concepts">
+
+ <title>UDF Concepts</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Concepts"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Depending on your use case, you might write all-new functions, reuse Java UDFs that you have already
+ written for Hive, or port Hive Java UDF code to higher-performance native Impala UDFs in C++. You can code
+ either scalar functions for producing results one row at a time, or more complex aggregate functions for
+ doing analysis across sets of rows. The following sections discuss these different aspects of working with UDFs.
+ </p>
+
+ <p outputclass="toc inpage"/>
+ </conbody>
+
+ <concept id="udfs_udafs">
+
+ <title>UDFs and UDAFs</title>
+
+ <conbody>
+
+ <p>
+ Depending on your use case, the user-defined functions (UDFs) you write might accept or produce different
+ numbers of input and output values:
+ </p>
+
+ <ul>
+ <li>
+ The most general kind of user-defined function (the one typically referred to by the abbreviation UDF)
+ takes a single input value and produces a single output value. When used in a query, it is called once
+ for each row in the result set. For example:
+<codeblock>select customer_name, is_frequent_customer(customer_id) from customers;
+select obfuscate(sensitive_column) from sensitive_data;</codeblock>
+ </li>
+
+ <li>
+ A user-defined aggregate function (UDAF) accepts a group of values and returns a single value. You use
+ UDAFs to summarize and condense sets of rows, in the same style as the built-in <codeph>COUNT()</codeph>,
+ <codeph>MAX()</codeph>, <codeph>SUM()</codeph>, and <codeph>AVG()</codeph> functions. When called in a
+ query that uses the <codeph>GROUP BY</codeph> clause, the function is called once for each combination
+ of <codeph>GROUP BY</codeph> values. For example:
+<codeblock>-- Evaluates multiple rows but returns a single value.
+select closest_restaurant(latitude, longitude) from places;
+
+-- Evaluates batches of rows and returns a separate value for each batch.
+select most_profitable_location(store_id, sales, expenses, tax_rate, depreciation) from franchise_data group by year;</codeblock>
+ </li>
+
+ <li>
+ Currently, Impala does not support other categories of user-defined functions, such as user-defined
+ table functions (UDTFs) or window functions.
+ </li>
+
+<!--
+<li>
+A user-defined table function (UDTF) returns an arbitrary number of rows (zero, one, or many) for each input row.
+These functions filter, explode, or transform the input data in a variety of ways.
+Currently, Impala does not support UDTFs.
+For example:
+<codeblock>select anomalous_event() from web_traffic;
+select price_change() from stock_ticker;
+select real_words(letters) from word_games;</codeblock>
+</li>
+-->
+ </ul>
+ </conbody>
+ </concept>
+
+ <concept id="native_udfs">
+
+ <title>Native Impala UDFs</title>
+
+ <conbody>
+
+ <p>
+ Impala supports UDFs written in C++, in addition to supporting existing Hive UDFs written in Java.
+ Cloudera recommends using C++ UDFs because the compiled native code can yield higher performance, with
+ UDF execution time often 10x faster for a C++ UDF than the equivalent Java UDF.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="udfs_hive">
+
+ <title>Using Hive UDFs with Impala</title>
+
+ <conbody>
+
+ <p>
+ Impala can run Java-based user-defined functions (UDFs), originally written for Hive, with no changes,
+ subject to the following conditions:
+ </p>
+
+ <ul>
+ <li>
+ The parameters and return value must all use scalar data types supported by Impala. For example, complex or nested
+ types are not supported.
+ </li>
+
+ <li>
+ Currently, Hive UDFs that accept or return the <codeph>TIMESTAMP</codeph> type are not supported.
+ </li>
+
+ <li>
+ The return type must be a <q>Writable</q> type such as <codeph>Text</codeph> or
+ <codeph>IntWritable</codeph>, rather than a Java primitive type such as <codeph>String</codeph> or
+ <codeph>int</codeph>. Otherwise, the UDF will return <codeph>NULL</codeph>.
+ </li>
+
+ <li>
+ Hive UDAFs and UDTFs are not supported.
+ </li>
+
+ <li>
+ Typically, a Java UDF will execute several times slower in Impala than the equivalent native UDF
+ written in C++.
+ </li>
+ </ul>
+
+ <p>
+ To take full advantage of the Impala architecture and performance features, you can also write
+ Impala-specific UDFs in C++.
+ </p>
+
+ <p>
+ For background about Java-based Hive UDFs, see the
+ <xref href="https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF" scope="external" format="html">Hive
+ documentation for UDFs</xref>. For examples or tutorials for writing such UDFs, search the web for
+ related blog posts.
+ </p>
+
+ <p>
+ The ideal way to understand how to reuse Java-based UDFs (originally written for Hive) with Impala is to
+ take some of the Hive built-in functions (implemented as Java UDFs) and take the applicable JAR files
+ through the UDF deployment process for Impala, creating new UDFs with different names:
+ </p>
+
+ <ol>
+ <li>
+ Take a copy of the Hive JAR file containing the Hive built-in functions. For example, the path might be
+ like <filepath>/usr/lib/hive/lib/hive-exec-0.10.0-cdh4.2.0.jar</filepath>, with different version
+ numbers corresponding to your specific level of CDH.
+ </li>
+
+ <li>
+ Use <codeph>jar tf <varname>jar_file</varname></codeph> to see a list of the classes inside the JAR.
+ You will see names like <codeph>org/apache/hadoop/hive/ql/udf/UDFLower.class</codeph> and
+ <codeph>org/apache/hadoop/hive/ql/udf/UDFOPNegative.class</codeph>. Make a note of the names of the
+ functions you want to experiment with. When you specify the entry points for the Impala <codeph>CREATE
+ FUNCTION</codeph> statement, change the slash characters to dots and strip off the
+ <codeph>.class</codeph> suffix, for example <codeph>org.apache.hadoop.hive.ql.udf.UDFLower</codeph> and
+ <codeph>org.apache.hadoop.hive.ql.udf.UDFOPNegative</codeph>.
+ </li>
+
+ <li>
+ Copy that file to an HDFS location that Impala can read. (In the examples here, we renamed the file to
+ <filepath>hive-builtins.jar</filepath> in HDFS for simplicity.)
+ </li>
+
+ <li>
+ For each Java-based UDF that you want to call through Impala, issue a <codeph>CREATE FUNCTION</codeph>
+ statement, with a <codeph>LOCATION</codeph> clause containing the full HDFS path of the JAR file, and a
+ <codeph>SYMBOL</codeph> clause with the fully qualified name of the class, using dots as separators and
+ without the <codeph>.class</codeph> extension. Remember that user-defined functions are associated with
+ a particular database, so issue a <codeph>USE</codeph> statement for the appropriate database first, or
+ specify the SQL function name as
+ <codeph><varname>db_name</varname>.<varname>function_name</varname></codeph>. Use completely new names
+ for the SQL functions, because Impala UDFs cannot have the same name as Impala built-in functions.
+ </li>
+
+ <li>
+ Call the function from your queries, passing arguments of the correct type to match the function
+ signature. These arguments could be references to columns, arithmetic or other kinds of expressions,
+ the results of <codeph>CAST</codeph> functions to ensure correct data types, and so on.
+ </li>
+ </ol>
+
+ <example>
+
+ <title>Java UDF Example: Reusing lower() Function</title>
+
+ <p>
+ For example, the following <cmdname>impala-shell</cmdname> session creates an Impala UDF
+ <codeph>my_lower()</codeph> that reuses the Java code for the Hive <codeph>lower()</codeph> built-in
+ function. We cannot call it <codeph>lower()</codeph> because Impala does not allow UDFs to have the
+ same name as built-in functions. From SQL, we call the function in a basic way (in a query with no
+ <codeph>WHERE</codeph> clause), directly on a column, and on the results of a string expression:
+ </p>
+
+<codeblock>[localhost:21000] > create database udfs;
+[localhost:21000] > use udfs;
+[localhost:21000] > create function lower(string) returns string location '/user/hive/udfs/hive.jar' symbol='org.apache.hadoop.hive.ql.udf.UDFLower';
+ERROR: AnalysisException: Function cannot have the same name as a builtin: lower
+[localhost:21000] > create function my_lower(string) returns string location '/user/hive/udfs/hive.jar' symbol='org.apache.hadoop.hive.ql.udf.UDFLower';
+[localhost:21000] > select my_lower('Some String NOT ALREADY LOWERCASE');
++----------------------------------------------------+
+| udfs.my_lower('some string not already lowercase') |
++----------------------------------------------------+
+| some string not already lowercase |
++----------------------------------------------------+
+Returned 1 row(s) in 0.11s
+[localhost:21000] > create table t2 (s string);
+[localhost:21000] > insert into t2 values ('lower'),('UPPER'),('Init cap'),('CamelCase');
+Inserted 4 rows in 2.28s
+[localhost:21000] > select * from t2;
++-----------+
+| s |
++-----------+
+| lower |
+| UPPER |
+| Init cap |
+| CamelCase |
++-----------+
+Returned 4 row(s) in 0.47s
+[localhost:21000] > select my_lower(s) from t2;
++------------------+
+| udfs.my_lower(s) |
++------------------+
+| lower |
+| upper |
+| init cap |
+| camelcase |
++------------------+
+Returned 4 row(s) in 0.54s
+[localhost:21000] > select my_lower(concat('ABC ',s,' XYZ')) from t2;
++------------------------------------------+
+| udfs.my_lower(concat('abc ', s, ' xyz')) |
++------------------------------------------+
+| abc lower xyz |
+| abc upper xyz |
+| abc init cap xyz |
+| abc camelcase xyz |
++------------------------------------------+
+Returned 4 row(s) in 0.22s</codeblock>
+
+ </example>
+
+ <example>
+
+ <title>Java UDF Example: Reusing negative() Function</title>
+
+ <p>
+ Here is an example that reuses the Hive Java code for the <codeph>negative()</codeph> built-in
+ function. This example demonstrates how the data types of the arguments must match precisely with the
+ function signature. At first, we create an Impala SQL function that can only accept an integer
+ argument. Impala cannot find a matching function when the query passes a floating-point argument,
+ although we can call the integer version of the function by casting the argument. Then we overload the
+ same function name to also accept a floating-point argument.
+ </p>
+
+<codeblock>[localhost:21000] > create table t (x int);
+[localhost:21000] > insert into t values (1), (2), (4), (100);
+Inserted 4 rows in 1.43s
+[localhost:21000] > create function my_neg(bigint) returns bigint location '/user/hive/udfs/hive.jar' symbol='org.apache.hadoop.hive.ql.udf.UDFOPNegative';
+[localhost:21000] > select my_neg(4);
++----------------+
+| udfs.my_neg(4) |
++----------------+
+| -4 |
++----------------+
+[localhost:21000] > select my_neg(x) from t;
++----------------+
+| udfs.my_neg(x) |
++----------------+
+| -1 |
+| -2 |
+| -4 |
+| -100 |
++----------------+
+Returned 4 row(s) in 0.60s
+[localhost:21000] > select my_neg(4.0);
+ERROR: AnalysisException: No matching function with signature: udfs.my_neg(FLOAT).
+[localhost:21000] > select my_neg(cast(4.0 as int));
++-------------------------------+
+| udfs.my_neg(cast(4.0 as int)) |
++-------------------------------+
+| -4 |
++-------------------------------+
+Returned 1 row(s) in 0.11s
+[localhost:21000] > create function my_neg(double) returns double location '/user/hive/udfs/hive.jar' symbol='org.apache.hadoop.hive.ql.udf.UDFOPNegative';
+[localhost:21000] > select my_neg(4.0);
++------------------+
+| udfs.my_neg(4.0) |
++------------------+
+| -4 |
++------------------+
+Returned 1 row(s) in 0.11s</codeblock>
+
+ <p>
+ You can find the sample files mentioned here in
+ <xref href="https://github.com/cloudera/impala/tree/master/be/src/udf_samples" scope="external" format="html">the
+ Impala github repo</xref>.
+<!-- Internal-only repo, don't know an external equivalent.
+and other examples demonstrating this technique in
+<xref href="http://github.sf.cloudera.com/CDH/Impala/blob/master/testdata/workloads/functional-query/queries/QueryTest/load-hive-udfs.test" scope="external" format="html">the Impala test files</xref>.
+-->
+ </p>
+
+ </example>
+ </conbody>
+ </concept>
+ </concept>
+
+ <concept id="udf_runtime">
+ <title>Runtime Environment for UDFs</title>
+ <conbody>
+ <p>
+ By default, Impala copies UDFs into <filepath>/tmp</filepath>,
+ and you can configure this location through the <codeph>--local_library_dir</codeph>
+ startup flag for the <cmdname>impalad</cmdname> daemon.
+ </p>
+ </conbody>
+ </concept>
+
+
+ <concept id="udf_demo_env">
+
+ <title>Installing the UDF Development Package</title>
+
+ <conbody>
+
+ <p>
+ To develop UDFs for Impala, download and install the <codeph>impala-udf-devel</codeph> package containing
+ header files, sample source, and build configuration files.
+ </p>
+
+ <ol>
+ <li>
+ Start at <xref href="https://archive.cloudera.com/cdh5/" scope="external" format="html"/> for the CDH 5
+ package, or <xref href="https://archive.cloudera.com/impala/" scope="external" format="html"/> for the CDH
+ 4 package.
+ </li>
+
+ <li>
+ Locate the appropriate <codeph>.repo</codeph> or list file for your operating system version, such as
+ <xref href="https://archive.cloudera.com/impala/redhat/6/x86_64/impala/cloudera-impala.repo" scope="external" format="html">the
+ <codeph>.repo</codeph> file for CDH 4 on RHEL 6</xref>.
+ </li>
+
+ <li>
+ Use the familiar <codeph>yum</codeph>, <codeph>zypper</codeph>, or <codeph>apt-get</codeph> commands
+ depending on your operating system, with <codeph>impala-udf-devel</codeph> for the package name.
+ </li>
+ </ol>
+
+ <note>
+ The UDF development code does not rely on Impala being installed on the same machine. You can write and
+ compile UDFs on a minimal development system, then deploy them on a different one for use with Impala. If
+ you develop UDFs on a server managed by Cloudera Manager through the parcel mechanism, you still install
+ the UDF development kit through the package mechanism; this small standalone package does not interfere
+ with the parcels containing the main Impala code.
+ </note>
+
+ <p>
+ When you are ready to start writing your own UDFs, download the sample code and build scripts from
+ <xref href="https://github.com/cloudera/impala-udf-samples" scope="external" format="html">the Cloudera
+ sample UDF github</xref>. Then see <xref href="impala_udf.xml#udf_coding"/> for how to code UDFs, and
+ <xref href="impala_udf.xml#udf_tutorial"/> for how to build and run UDFs.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="udf_coding">
+
+ <title>Writing User-Defined Functions (UDFs)</title>
+
+ <conbody>
+
+ <p>
+ Before starting UDF development, make sure to install the development package and download the UDF code
+ samples, as described in <xref href="#udf_demo_env"/>.
+ </p>
+
+ <p>
+ When writing UDFs:
+ </p>
+
+ <ul>
+ <li>
+ Keep in mind the data type differences as you transfer values from the high-level SQL to your lower-level
+ UDF code. For example, in the UDF code you might be much more aware of how many bytes different kinds of
+ integers require.
+ </li>
+
+ <li>
+ Use best practices for function-oriented programming: choose arguments carefully, avoid side effects,
+ make each function do a single thing, and so on.
+ </li>
+ </ul>
+
+ <p outputclass="toc inpage"/>
+ </conbody>
+
+ <concept id="udf_exploring">
+
+ <title>Getting Started with UDF Coding</title>
+ <prolog>
+ <metadata>
+ <!-- OK, this is not something a Hadoop newbie would tackle, but being lenient and inclusive in this initial pass, so including the GS tag. -->
+ <data name="Category" value="Getting Started"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ To understand the layout and member variables and functions of the predefined UDF data types, examine the
+ header file <filepath>/usr/include/impala_udf/udf.h</filepath>:
+ </p>
+
+<codeblock>// This is the only Impala header required to develop UDFs and UDAs. This header
+// contains the types that need to be used and the FunctionContext object. The context
+// object serves as the interface object between the UDF/UDA and the impala process. </codeblock>
+
+ <p>
+ For the basic declarations needed to write a scalar UDF, see the header file
+ <filepath>udf-sample.h</filepath> within the sample build environment, which defines a simple function
+ named <codeph>AddUdf()</codeph>:
+ </p>
+
+<codeblock>#ifndef IMPALA_UDF_SAMPLE_UDF_H
+#define IMPALA_UDF_SAMPLE_UDF_H
+
+#include &lt;impala_udf/udf.h&gt;
+
+using namespace impala_udf;
+
+IntVal AddUdf(FunctionContext* context, const IntVal&amp; arg1, const IntVal&amp; arg2);
+
+#endif</codeblock>
+
+ <p>
+ For sample C++ code for a simple function named <codeph>AddUdf()</codeph>, see the source file
+ <filepath>udf-sample.cc</filepath> within the sample build environment:
+ </p>
+
+<codeblock>#include "udf-sample.h"
+
+// In this sample we are declaring a UDF that adds two ints and returns an int.
+IntVal AddUdf(FunctionContext* context, const IntVal&amp; arg1, const IntVal&amp; arg2) {
+ if (arg1.is_null || arg2.is_null) return IntVal::null();
+ return IntVal(arg1.val + arg2.val);
+}
+
+// Multiple UDFs can be defined in the same file</codeblock>
+ </conbody>
+ </concept>
+
+ <concept id="udfs_args">
+
+ <title>Data Types for Function Arguments and Return Values</title>
+
+ <conbody>
+
+ <p>
+ Each value that a user-defined function can accept as an argument or return as a result value must map to
+ a SQL data type that you could specify for a table column.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/udfs_no_complex_types"/>
+
+ <p>
+ Each data type has a corresponding structure defined in the C++ and Java header files, with two member
+ fields and some predefined comparison operators and constructors:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ <codeph>is_null</codeph> indicates whether the value is <codeph>NULL</codeph> or not.
+ <codeph>val</codeph> holds the actual argument or return value when it is non-<codeph>NULL</codeph>.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Each struct also defines a <codeph>null()</codeph> member function that constructs an instance of the
+ struct with the <codeph>is_null</codeph> flag set.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ The built-in SQL comparison operators and clauses such as <codeph>&lt;</codeph>,
+ <codeph>&gt;=</codeph>, <codeph>BETWEEN</codeph>, and <codeph>ORDER BY</codeph> all work
+ automatically based on the SQL return type of each UDF. For example, Impala knows how to evaluate
+ <codeph>BETWEEN 1 AND udf_returning_int(col1)</codeph> or <codeph>ORDER BY
+ udf_returning_string(col2)</codeph> without you declaring any comparison operators within the UDF
+ itself.
+ </p>
+ <p>
+ For convenience within your UDF code, each struct defines <codeph>==</codeph> and <codeph>!=</codeph>
+ operators for comparisons with other structs of the same type. These are for typical C++ comparisons
+ within your own code, not necessarily reproducing SQL semantics. For example, if the
+ <codeph>is_null</codeph> flag is set in both structs, they compare as equal. That behavior of
+ <codeph>null</codeph> comparisons is different from SQL (where <codeph>NULL == NULL</codeph> is
+ <codeph>NULL</codeph> rather than <codeph>true</codeph>), but more in line with typical C++ behavior.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Each kind of struct has one or more constructors that define a filled-in instance of the struct,
+ optionally with default values.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Each kind of struct has a <codeph>null()</codeph> member function that returns an instance of the
+ struct with the <codeph>is_null</codeph> flag set.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Because Impala currently does not support composite or nested types, Impala cannot process UDFs that
+ accept such types as arguments or return them as result values. This limitation applies both to
+ Impala UDFs written in C++ and Java-based Hive UDFs.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ You can overload functions by creating multiple functions with the same SQL name but different
+ argument types. For overloaded functions, you must use different C++ or Java entry point names in the
+ underlying functions.
+ </p>
+ </li>
+ </ul>
+
+ <p>
+ The data types defined on the C++ side (in <filepath>/usr/include/impala_udf/udf.h</filepath>) are:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ <codeph>IntVal</codeph> represents an <codeph>INT</codeph> column.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ <codeph>BigIntVal</codeph> represents a <codeph>BIGINT</codeph> column. Even if you do not need the
+ full range of a <codeph>BIGINT</codeph> value, it can be useful to code your function arguments as
+ <codeph>BigIntVal</codeph> to make it convenient to call the function with different kinds of integer
+ columns and expressions as arguments. Impala automatically casts smaller integer types to larger ones
+ when appropriate, but does not implicitly cast large integer types to smaller ones.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ <codeph>SmallIntVal</codeph> represents a <codeph>SMALLINT</codeph> column.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ <codeph>TinyIntVal</codeph> represents a <codeph>TINYINT</codeph> column.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ <codeph>StringVal</codeph> represents a <codeph>STRING</codeph> column. It has a <codeph>len</codeph>
+ field representing the length of the string, and a <codeph>ptr</codeph> field pointing to the string
+ data. It has constructors that create a new <codeph>StringVal</codeph> struct based on a
+ null-terminated C-style string, or a pointer plus a length; these new structs still refer to the
+ original string data rather than allocating a new buffer for the data. It also has a constructor that
+ takes a pointer to a <codeph>FunctionContext</codeph> struct and a length, that does allocate space
+ for a new copy of the string data, for use in UDFs that return string values.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ <codeph>BooleanVal</codeph> represents a <codeph>BOOLEAN</codeph> column.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ <codeph>FloatVal</codeph> represents a <codeph>FLOAT</codeph> column.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ <codeph>DoubleVal</codeph> represents a <codeph>DOUBLE</codeph> column.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ <codeph>TimestampVal</codeph> represents a <codeph>TIMESTAMP</codeph> column. It has a
+ <codeph>date</codeph> field, a 32-bit integer representing the Gregorian date, that is, the days past
+ the epoch date. It also has a <codeph>time_of_day</codeph> field, a 64-bit integer representing the
+ current time of day in nanoseconds.
+ </p>
+ </li>
+
+<!--
+ <li>
+ <p>
+ <codeph>AnyVal</codeph> is the parent type of all the other
+ structs. They inherit the <codeph>is_null</codeph> field from it.
+ You do not use this type directly in your code.
+ </p>
+ </li>
+-->
+ </ul>
+ </conbody>
+ </concept>
+
+ <concept id="udf_varargs">
+
+ <title>Variable-Length Argument Lists</title>
+
+ <conbody>
+
+ <p>
+ UDFs typically take a fixed number of arguments, with each one named explicitly in the signature of your
+ C++ function. Your function can also accept additional optional arguments, all of the same type. For
+ example, you can concatenate two strings, three strings, four strings, and so on. Or you can compare two
+ numbers, three numbers, four numbers, and so on.
+ </p>
+
+ <p>
+ To accept a variable-length argument list, code the signature of your function like this:
+ </p>
+
+<codeblock>StringVal Concat(FunctionContext* context, const StringVal&amp; separator,
+ int num_var_args, const StringVal* args);</codeblock>
+
+ <p>
+ In the <codeph>CREATE FUNCTION</codeph> statement, after the type of the first optional argument, include
+ <codeph>...</codeph> to indicate it could be followed by more arguments of the same type. For example,
+ the following function accepts a <codeph>STRING</codeph> argument, followed by one or more additional
+ <codeph>STRING</codeph> arguments:
+ </p>
+
+<codeblock>[localhost:21000] > create function my_concat(string, string ...) returns string location '/user/test_user/udfs/sample.so' symbol='Concat';
+</codeblock>
+
+ <p>
+ The call from the SQL query must pass at least one argument to the variable-length portion of the
+ argument list.
+ </p>
+
+ <p>
+ When Impala calls the function, it fills in the initial set of required arguments, then passes the number
+ of extra arguments and a pointer to the first of those optional arguments.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="udf_null">
+
+ <title>Handling NULL Values</title>
+
+ <conbody>
+
+ <p>
+ For correctness, performance, and reliability, it is important for each UDF to handle all situations
+ where any <codeph>NULL</codeph> values are passed to your function. For example, when passed a
+ <codeph>NULL</codeph>, UDFs typically also return <codeph>NULL</codeph>. In an aggregate function, which
+ could be passed a combination of real and <codeph>NULL</codeph> values, you might make the final value
+ into a <codeph>NULL</codeph> (as in <codeph>CONCAT()</codeph>), ignore the <codeph>NULL</codeph> value
+ (as in <codeph>AVG()</codeph>), or treat it the same as a numeric zero or empty string.
+ </p>
+
+ <p>
+ Each parameter type, such as <codeph>IntVal</codeph> or <codeph>StringVal</codeph>, has an
+ <codeph>is_null</codeph> Boolean member.
+<!--
+If your function has no effect when passed <codeph>NULL</codeph>
+values,
+-->
+ Test this flag immediately for each argument to your function, and if it is set, do not refer to the
+ <codeph>val</codeph> field of the argument structure. The <codeph>val</codeph> field is undefined when
+ the argument is <codeph>NULL</codeph>, so your function could go into an infinite loop or produce
+ incorrect results if you skip the special handling for <codeph>NULL</codeph>.
+<!-- and return if so.
+For <codeph>void</codeph> intermediate functions
+within UDAs, you can return without specifying a value.
+-->
+ </p>
+
+ <p>
+ If your function returns <codeph>NULL</codeph> when passed a <codeph>NULL</codeph> value, or in other
+ cases such as when a search string is not found, you can construct a null instance of the return type by
+ using its <codeph>null()</codeph> member function.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="udf_malloc">
+
+ <title>Memory Allocation for UDFs</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Memory"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ By default, memory allocated within a UDF is deallocated when the function exits, which could be before
+ the query is finished. The input arguments remain allocated for the lifetime of the function, so you can
+ refer to them in the expressions for your return values. If you use temporary variables to construct
+ all-new string values, use the <codeph>StringVal()</codeph> constructor that takes an initial
+ <codeph>FunctionContext*</codeph> argument followed by a length, and copy the data into the newly
+ allocated memory buffer.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept rev="1.3.0" id="udf_threads">
+
+ <title>Thread-Safe Work Area for UDFs</title>
+
+ <conbody>
+
+ <p>
+ One way to improve performance of UDFs is to specify the optional <codeph>PREPARE_FN</codeph> and
+ <codeph>CLOSE_FN</codeph> clauses on the <codeph>CREATE FUNCTION</codeph> statement. The <q>prepare</q>
+ function sets up a thread-safe data structure in memory that you can use as a work area. The <q>close</q>
+ function deallocates that memory. Each subsequent call to the UDF within the same thread can access that
+ same memory area. There might be several such memory areas allocated on the same host, as UDFs are
+ parallelized using multiple threads.
+ </p>
+
+ <p>
+ Within this work area, you can set up predefined lookup tables, or record the results of complex
+ operations on data types such as <codeph>STRING</codeph> or <codeph>TIMESTAMP</codeph>. Saving the
+ results of previous computations rather than repeating the computation each time is an optimization known
+ as <xref href="http://en.wikipedia.org/wiki/Memoization" scope="external" format="html">memoization</xref>. For example,
+ if your UDF performs a regular expression match or date manipulation on a column that repeats the same
+ value over and over, you could store the last-computed value or a hash table of already-computed values,
+ and do a fast lookup to find the result for subsequent iterations of the UDF.
+ </p>
+
+ <p>
+ Each such function must have the signature:
+ </p>
+
+<codeblock>void <varname>function_name</varname>(impala_udf::FunctionContext*, impala_udf::FunctionContext::FunctionScope)
+</codeblock>
+
+ <p>
+ Currently, only <codeph>THREAD_SCOPE</codeph> is implemented, not <codeph>FRAGMENT_SCOPE</codeph>. See
+ <filepath>udf.h</filepath> for details about the scope values.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="udf_error_handling">
+
+ <title>Error Handling for UDFs</title>
+ <prolog>
+ <metadata>
+ <!-- A little bit of a stretch, but if you're doing UDFs and you need to debug you might look up Troubleshooting. -->
+ <data name="Category" value="Troubleshooting"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ To handle errors in UDFs, you call functions that are members of the initial
+ <codeph>FunctionContext*</codeph> argument passed to your function.
+ </p>
+
+ <p>
+ A UDF can record one or more warnings, for conditions that indicate minor, recoverable problems that do
+ not cause the query to stop. The signature for this function is:
+ </p>
+
+<codeblock>bool AddWarning(const char* warning_msg);</codeblock>
+
+ <p>
+ For a serious problem that requires cancelling the query, a UDF can set an error flag that prevents the
+ query from returning any results. The signature for this function is:
+ </p>
+
+<codeblock>void SetError(const char* error_msg);</codeblock>
+ </conbody>
+ </concept>
+ </concept>
+
+ <concept id="udafs">
+
+ <title>Writing User-Defined Aggregate Functions (UDAFs)</title>
+
+ <conbody>
+
+ <p>
+ User-defined aggregate functions (UDAFs or UDAs) are a powerful and flexible category of user-defined
+ functions. If a query processes N rows, calling a UDAF during the query condenses the result set,
+ producing anywhere from a single value (such as with the <codeph>SUM</codeph> or <codeph>MAX</codeph>
+ functions) to some number of values less than or equal to N (as in queries using the
+ <codeph>GROUP BY</codeph> or <codeph>HAVING</codeph> clause).
+ </p>
+
+ <p outputclass="toc inpage"/>
+ </conbody>
+
+ <concept id="uda_functions">
+
+ <title>The Underlying Functions for a UDA</title>
+
+ <conbody>
+
+ <p>
+ A UDAF must maintain a state value across subsequent calls, so that it can accumulate a result across a
+ set of calls, rather than derive it purely from one set of arguments. For that reason, a UDAF is
+ represented by multiple underlying functions:
+ </p>
+
+ <ul>
+ <li>
+ An initialization function that sets any counters to zero, creates empty buffers, and does any other
+ one-time setup for a query.
+ </li>
+
+ <li>
+ An update function that processes the arguments for each row in the query result set and accumulates an
+ intermediate result for each node. For example, this function might increment a counter, append to a
+ string buffer, or set flags.
+ </li>
+
+ <li>
+ A merge function that combines the intermediate results from two different nodes.
+ </li>
+
+ <li rev="2.0.0">
+ A serialize function that flattens any intermediate values containing pointers, and frees any memory
+ allocated during the init, update, and merge phases.
+ </li>
+
+ <li>
+ A finalize function that either passes through the combined result unchanged, or does one final
+ transformation.
+ </li>
+ </ul>
+
+ <p>
+ In the SQL syntax, you create a UDAF by using the statement <codeph>CREATE AGGREGATE FUNCTION</codeph>.
+ You specify the entry points of the underlying C++ functions using the clauses <codeph>INIT_FN</codeph>,
+ <codeph>UPDATE_FN</codeph>, <codeph>MERGE_FN</codeph>, <codeph rev="2.0.0">SERIALIZE_FN</codeph>, and
+ <codeph>FINALIZE_FN</codeph>.
+ </p>
+
+ <p>
+ <draft-comment translate="no">
+Need an example to demonstrate exactly what tokens are used for init, merge, finalize in
+this substitution.
+</draft-comment>
+ For convenience, you can use a naming convention for the underlying functions and Impala automatically
+ recognizes those entry points. Specify the <codeph>UPDATE_FN</codeph> clause, using an entry point name
+ containing the string <codeph>update</codeph> or <codeph>Update</codeph>. When you omit the other
+ <codeph>_FN</codeph> clauses from the SQL statement, Impala looks for entry points with names formed by
+ substituting <codeph>init</codeph>, <codeph>merge</codeph>, <codeph>serialize</codeph>, or
+ <codeph>finalize</codeph> in place of the <codeph>update</codeph> or <codeph>Update</codeph> portion of
+ the specified name.
+ </p>
+
+<!--
+[INIT_FN '<varname>function</varname>]
+[UPDATE_FN '<varname>function</varname>]
+[MERGE_FN '<varname>function</varname>]
+[FINALIZE_FN '<varname>function</varname>]
+-->
+
+ <p>
+ <filepath>uda-sample.h</filepath>:
+ </p>
+
+ <p>
+ See this file online at:
+ <xref href="https://github.com/cloudera/impala-udf-samples/blob/master/uda-sample.h" scope="external" format="html"/>
+ </p>
+
+<codeblock audience="Cloudera">#ifndef IMPALA_UDF_SAMPLE_UDA_H
+#define IMPALA_UDF_SAMPLE_UDA_H
+
+#include <impala_udf/udf.h>
+
+using namespace impala_udf;
+
+// This is an example of the COUNT aggregate function.
+void CountInit(FunctionContext* context, BigIntVal* val);
+void CountUpdate(FunctionContext* context, const AnyVal& input, BigIntVal* val);
+void CountMerge(FunctionContext* context, const BigIntVal& src, BigIntVal* dst);
+BigIntVal CountFinalize(FunctionContext* context, const BigIntVal& val);
+
+// This is an example of the AVG(double) aggregate function. This function needs to
+// maintain two pieces of state, the current sum and the count. We do this using
+// the BufferVal intermediate type. When this UDA is registered, it would specify
+// 16 bytes (8 byte sum + 8 byte count) as the size for this buffer.
+void AvgInit(FunctionContext* context, BufferVal* val);
+void AvgUpdate(FunctionContext* context, const DoubleVal& input, BufferVal* val);
+void AvgMerge(FunctionContext* context, const BufferVal& src, BufferVal* dst);
+DoubleVal AvgFinalize(FunctionContext* context, const BufferVal& val);
+
+// This is a sample of implementing the STRING_CONCAT aggregate function.
+// Example: select string_concat(string_col, ",") from table
+void StringConcatInit(FunctionContext* context, StringVal* val);
+void StringConcatUpdate(FunctionContext* context, const StringVal& arg1,
+ const StringVal& arg2, StringVal* val);
+void StringConcatMerge(FunctionContext* context, const StringVal& src, StringVal* dst);
+StringVal StringConcatFinalize(FunctionContext* context, const StringVal& val);
+
+#endif</codeblock>
+
+ <p>
+ <filepath>uda-sample.cc</filepath>:
+ </p>
+
+ <p>
+ See this file online at:
+ <xref href="https://github.com/cloudera/impala-udf-samples/blob/master/uda-sample.cc" scope="external" format="html"/>
+ </p>
+
+<codeblock audience="Cloudera">#include "uda-sample.h"
+#include <assert.h>
+
+using namespace impala_udf;
+
+// ---------------------------------------------------------------------------
+// This is a sample of implementing a COUNT aggregate function.
+// ---------------------------------------------------------------------------
+void CountInit(FunctionContext* context, BigIntVal* val) {
+ val->is_null = false;
+ val->val = 0;
+}
+
+void CountUpdate(FunctionContext* context, const AnyVal& input, BigIntVal* val) {
+ if (input.is_null) return;
+ ++val->val;
+}
+
+void CountMerge(FunctionContext* context, const BigIntVal& src, BigIntVal* dst) {
+ dst->val += src.val;
+}
+
+BigIntVal CountFinalize(FunctionContext* context, const BigIntVal& val) {
+ return val;
+}
+
+// ---------------------------------------------------------------------------
+// This is a sample of implementing an AVG aggregate function.
+// ---------------------------------------------------------------------------
+struct AvgStruct {
+ double sum;
+ int64_t count;
+};
+
+void AvgInit(FunctionContext* context, BufferVal* val) {
+ assert(sizeof(AvgStruct) == 16);
+ memset(*val, 0, sizeof(AvgStruct));
+}
+
+void AvgUpdate(FunctionContext* context, const DoubleVal& input, BufferVal* val) {
+ if (input.is_null) return;
+ AvgStruct* avg = reinterpret_cast<AvgStruct*>(*val);
+ avg->sum += input.val;
+ ++avg->count;
+}
+
+void AvgMerge(FunctionContext* context, const BufferVal& src, BufferVal* dst) {
+ if (src == NULL) return;
+ const AvgStruct* src_struct = reinterpret_cast<const AvgStruct*>(src);
+ AvgStruct* dst_struct = reinterpret_cast<AvgStruct*>(*dst);
+ dst_struct->sum += src_struct->sum;
+ dst_struct->count += src_struct->count;
+}
+
+DoubleVal AvgFinalize(FunctionContext* context, const BufferVal& val) {
+ if (val == NULL) return DoubleVal::null();
+ AvgStruct* val_struct = reinterpret_cast<AvgStruct*>(val);
+ return DoubleVal(val_struct->sum / val_struct->count);
+}
+
+// ---------------------------------------------------------------------------
+// This is a sample of implementing the STRING_CONCAT aggregate function.
+// Example: select string_concat(string_col, ",") from table
+// ---------------------------------------------------------------------------
+void StringConcatInit(FunctionContext* context, StringVal* val) {
+ val->is_null = true;
+}
+
+void StringConcatUpdate(FunctionContext* context, const StringVal& arg1,
+ const StringVal& arg2, StringVal* val) {
+ if (val->is_null) {
+ val->is_null = false;
+ *val = StringVal(context, arg1.len);
+ memcpy(val->ptr, arg1.ptr, arg1.len);
+ } else {
+ int new_len = val->len + arg1.len + arg2.len;
+ StringVal new_val(context, new_len);
+ memcpy(new_val.ptr, val->ptr, val->len);
+ memcpy(new_val.ptr + val->len, arg2.ptr, arg2.len);
+ memcpy(new_val.ptr + val->len + arg2.len, arg1.ptr, arg1.len);
+ *val = new_val;
+ }
+}
+
+void StringConcatMerge(FunctionContext* context, const StringVal& src, StringVal* dst) {
+ if (src.is_null) return;
+ StringConcatUpdate(context, src, ",", dst);
+}
+
+StringVal StringConcatFinalize(FunctionContext* context, const StringVal& val) {
+ return val;
+}</codeblock>
+ </conbody>
+ </concept>
+
+ <concept audience="Cloudera" id="udf_intermediate">
+
+ <title>Intermediate Results for UDAs</title>
+
+ <conbody>
+
+ <p>
+ A user-defined aggregate function might produce and combine intermediate results during some phases of
+ processing, using a different data type than the final return value. For example, if you implement a
+ function similar to the built-in <codeph>AVG()</codeph> function, it must keep track of two values, the
+ number of values counted and the sum of those values. Or, you might accumulate a string value over the
+ course of a UDA, then in the end return a numeric or Boolean result.
+ </p>
+
+ <p>
+ In such a case, specify the data type of the intermediate results using the optional <codeph>INTERMEDIATE
+ <varname>type_name</varname></codeph> clause of the <codeph>CREATE AGGREGATE FUNCTION</codeph> statement.
+ If the intermediate data is a typeless byte array (for example, to represent a C++ struct or array),
+ specify the type name as <codeph>CHAR(<varname>n</varname>)</codeph>, with <varname>n</varname>
+ representing the number of bytes in the intermediate result buffer.
+ </p>
+ </conbody>
+ </concept>
+ </concept>
+
+ <concept id="udf_building">
+
+ <title>Building and Deploying UDFs</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Deploying"/>
+ <data name="Category" value="Building"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ This section explains the steps to compile Impala UDFs from C++ source code, and deploy the resulting
+ libraries for use in Impala queries.
+ </p>
+
+ <p>
+ Impala ships with a sample build environment for UDFs that you can study, experiment with, and adapt for
+ your own use. This sample build environment starts with the <cmdname>cmake</cmdname> configuration command,
+ which reads the file <filepath>CMakeLists.txt</filepath> and generates a <filepath>Makefile</filepath>
+ customized for your particular directory paths. Then the <cmdname>make</cmdname> command runs the actual
+ build steps based on the rules in the <filepath>Makefile</filepath>.
+ </p>
+
+ <p>
+ Impala loads the shared library from an HDFS location. After building a shared library containing one or
+ more UDFs, use <codeph>hdfs dfs</codeph> or <codeph>hadoop fs</codeph> commands to copy the binary file to
+ an HDFS location readable by Impala.
+ </p>
+
+ <p>
+ The final step in deployment is to issue a <codeph>CREATE FUNCTION</codeph> statement in the
+ <cmdname>impala-shell</cmdname> interpreter to make Impala aware of the new function. See
+ <xref href="impala_create_function.xml#create_function"/> for syntax details. Because each function is
+ associated with a particular database, always issue a <codeph>USE</codeph> statement to the appropriate
+ database before creating a function, or specify a fully qualified name, that is, <codeph>CREATE FUNCTION
+ <varname>db_name</varname>.<varname>function_name</varname></codeph>.
+ </p>
+
+ <p>
+ As you update the UDF code and redeploy updated versions of a shared library, use <codeph>DROP
+ FUNCTION</codeph> and <codeph>CREATE FUNCTION</codeph> to let Impala pick up the latest version of the
+ code.
+ </p>
+
+ <note>
+ <p conref="../shared/impala_common.xml#common/udf_persistence_restriction"/>
+ </note>
+
+ <p>
+ Prerequisites for the build environment are:
+ </p>
+
+<codeblock># Use the appropriate package installation command for your Linux distribution.
+sudo yum install gcc-c++ cmake boost-devel
+sudo yum install impala-udf-devel</codeblock>
+
+ <p>
+ Then, unpack the sample code in <filepath>udf_samples.tar.gz</filepath> and use that as a template to set
+ up your build environment.
+ </p>
+
+ <p>
+ To build the original samples:
+ </p>
+
+<codeblock># Process CMakeLists.txt and set up appropriate Makefiles.
+cmake .
+# Generate shared libraries from UDF and UDAF sample code,
+# udf_samples/libudfsample.so and udf_samples/libudasample.so
+make</codeblock>
+
+ <p>
+ The sample code to examine, experiment with, and adapt is in these files:
+ </p>
+
+ <ul>
+ <li>
+ <filepath>udf-sample.h</filepath>: Header file that declares the signature for a scalar UDF
+ (<codeph>AddUDF</codeph>).
+ </li>
+
+ <li>
+ <filepath>udf-sample.cc</filepath>: Sample source for a simple UDF that adds two integers. Because
+ Impala can reference multiple function entry points from the same shared library, you could add other UDF
+ functions in this file and add their signatures to the corresponding header file.
+ </li>
+
+ <li>
+ <filepath>udf-sample-test.cc</filepath>: Basic unit tests for the sample UDF.
+ </li>
+
+ <li>
+ <filepath>uda-sample.h</filepath>: Header file that declares the signature for sample aggregate
+ functions. The SQL functions will be called <codeph>COUNT</codeph>, <codeph>AVG</codeph>, and
+ <codeph>STRINGCONCAT</codeph>. Because aggregate functions require more elaborate coding to handle the
+ processing for multiple phases, there are several underlying C++ functions such as
+ <codeph>CountInit</codeph>, <codeph>AvgUpdate</codeph>, and <codeph>StringConcatFinalize</codeph>.
+ </li>
+
+ <li>
+ <filepath>uda-sample.cc</filepath>: Sample source for simple UDAFs that demonstrate how to manage the
+ state transitions as the underlying functions are called during the different phases of query processing.
+ <ul>
+ <li>
+ The UDAF that imitates the <codeph>COUNT</codeph> function keeps track of a single incrementing
+ number; the merge functions combine the intermediate count values from each Impala node, and the
+ combined number is returned verbatim by the finalize function.
+ </li>
+
+ <li>
+ The UDAF that imitates the <codeph>AVG</codeph> function keeps track of two numbers, a count of rows
+ processed and the sum of values for a column. These numbers are updated and merged as with
+ <codeph>COUNT</codeph>, then the finalize function divides them to produce and return the final
+ average value.
+ </li>
+
+ <li>
+ The UDAF that concatenates string values into a comma-separated list demonstrates how to manage
+ storage for a string that increases in length as the function is called for multiple rows.
+ </li>
+ </ul>
+ </li>
+
+ <li>
+ <filepath>uda-sample-test.cc</filepath>: Basic unit tests for the sample UDAFs.
+ </li>
+ </ul>
+ </conbody>
+ </concept>
+
+ <concept id="udf_performance">
+
+ <title>Performance Considerations for UDFs</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Performance"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Because a UDF typically processes each row of a table, potentially being called billions of times, the
+ performance of each UDF is a critical factor in the speed of the overall ETL or ELT pipeline. Tiny
+ optimizations you can make within the function body can pay off in a big way when the function is called
+ over and over when processing a huge result set.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="udf_tutorial">
+
+ <title>Examples of Creating and Using UDFs</title>
+
+ <conbody>
+
+ <p>
+ This section demonstrates how to create and use all kinds of user-defined functions (UDFs).
+ </p>
+
+ <p>
+ For downloadable examples that you can experiment with, adapt, and use as templates for your own functions,
+ see <xref href="https://github.com/cloudera/impala-udf-samples" scope="external" format="html">the Cloudera
+ sample UDF github</xref>. You must have already installed the appropriate header files, as explained in
+ <xref href="impala_udf.xml#udf_demo_env"/>.
+ </p>
+
+<!-- Limitation: mini-TOC currently doesn't include the <example> tags. -->
+
+<!-- <p outputclass="toc inpage"/> -->
+
+ <example id="udf_sample_udf">
+
+ <title>Sample C++ UDFs: HasVowels, CountVowels, StripVowels</title>
+
+ <p>
+ This example shows 3 separate UDFs that operate on strings and return different data types. In the C++
+ code, the functions are <codeph>HasVowels()</codeph> (checks if a string contains any vowels),
+ <codeph>CountVowels()</codeph> (returns the number of vowels in a string), and
+ <codeph>StripVowels()</codeph> (returns a new string with vowels removed).
+ </p>
+
+ <p>
+ First, we add the signatures for these functions to <filepath>udf-sample.h</filepath> in the demo build
+ environment:
+ </p>
+
+<codeblock>BooleanVal HasVowels(FunctionContext* context, const StringVal& input);
+IntVal CountVowels(FunctionContext* context, const StringVal& arg1);
+StringVal StripVowels(FunctionContext* context, const StringVal& arg1);</codeblock>
+
+ <p>
+ Then, we add the bodies of these functions to <filepath>udf-sample.cc</filepath>:
+ </p>
+
+<codeblock>BooleanVal HasVowels(FunctionContext* context, const StringVal& input)
+{
+ if (input.is_null) return BooleanVal::null();
+
+ int index;
+ uint8_t *ptr;
+
+ for (ptr = input.ptr, index = 0; index <= input.len; index++, ptr++)
+ {
+ uint8_t c = tolower(*ptr);
+ if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u')
+ {
+ return BooleanVal(true);
+ }
+ }
+ return BooleanVal(false);
+}
+
+IntVal CountVowels(FunctionContext* context, const StringVal& arg1)
+{
+ if (arg1.is_null) return IntVal::null();
+
+ int count;
+ int index;
+ uint8_t *ptr;
+
+ for (ptr = arg1.ptr, count = 0, index = 0; index <= arg1.len; index++, ptr++)
+ {
+ uint8_t c = tolower(*ptr);
+ if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u')
+ {
+ count++;
+ }
+ }
+ return IntVal(count);
+}
+
+StringVal StripVowels(FunctionContext* context, const StringVal& arg1)
+{
+ if (arg1.is_null) return StringVal::null();
+
+ int index;
+ std::string original((const char *)arg1.ptr,arg1.len);
+ std::string shorter("");
+
+ for (index = 0; index < original.length(); index++)
+ {
+ uint8_t c = original[index];
+ uint8_t l = tolower(c);
+
+ if (l == 'a' || l == 'e' || l == 'i' || l == 'o' || l == 'u')
+ {
+ ;
+ }
+ else
+ {
+ shorter.append(1, (char)c);
+ }
+ }
+// The modified string is stored in 'shorter', which is destroyed when this function ends. We need to make a string val
+// and copy the contents.
+ StringVal result(context, shorter.size()); // Only the version of the ctor that takes a context object allocates new memory
+ memcpy(result.ptr, shorter.c_str(), shorter.size());
+ return result;
+}</codeblock>
+
+ <p>
+ We build a shared library, <filepath>libudfsample.so</filepath>, and put the library file into HDFS
+ where Impala can read it:
+ </p>
+
+<codeblock>$ make
+[ 0%] Generating udf_samples/uda-sample.ll
+[ 16%] Built target uda-sample-ir
+[ 33%] Built target udasample
+[ 50%] Built target uda-sample-test
+[ 50%] Generating udf_samples/udf-sample.ll
+[ 66%] Built target udf-sample-ir
+Scanning dependencies of target udfsample
+[ 83%] Building CXX object CMakeFiles/udfsample.dir/udf-sample.o
+Linking CXX shared library udf_samples/libudfsample.so
+[ 83%] Built target udfsample
+Linking CXX executable udf_samples/udf-sample-test
+[100%] Built target udf-sample-test
+$ hdfs dfs -put ./udf_samples/libudfsample.so /user/hive/udfs/libudfsample.so</codeblock>
+
+ <p>
+ Finally, we go into the <cmdname>impala-shell</cmdname> interpreter where we set up some sample data,
+ issue <codeph>CREATE FUNCTION</codeph> statements to set up the SQL function names, and call the
+ functions in some queries:
+ </p>
+
+<codeblock>[localhost:21000] > create database udf_testing;
+[localhost:21000] > use udf_testing;
+
+[localhost:21000] > create function has_vowels (string) returns boolean location '/user/hive/udfs/libudfsample.so' symbol='HasVowels';
+[localhost:21000] > select has_vowels('abc');
++------------------------+
+| udfs.has_vowels('abc') |
++------------------------+
+| true |
++------------------------+
+Returned 1 row(s) in 0.13s
+[localhost:21000] > select has_vowels('zxcvbnm');
++----------------------------+
+| udfs.has_vowels('zxcvbnm') |
++----------------------------+
+| false |
++----------------------------+
+Returned 1 row(s) in 0.12s
+[localhost:21000] > select has_vowels(null);
++-----------------------+
+| udfs.has_vowels(null) |
++-----------------------+
+| NULL |
++-----------------------+
+Returned 1 row(s) in 0.11s
+[localhost:21000] > select s, has_vowels(s) from t2;
++-----------+--------------------+
+| s | udfs.has_vowels(s) |
++-----------+--------------------+
+| lower | true |
+| UPPER | true |
+| Init cap | true |
+| CamelCase | true |
++-----------+--------------------+
+Returned 4 row(s) in 0.24s
+
+[localhost:21000] > create function count_vowels (string) returns int location '/user/hive/udfs/libudfsample.so' symbol='CountVowels';
+[localhost:21000] > select count_vowels('cat in the hat');
++-------------------------------------+
+| udfs.count_vowels('cat in the hat') |
++-------------------------------------+
+| 4 |
++-------------------------------------+
+Returned 1 row(s) in 0.12s
+[localhost:21000] > select s, count_vowels(s) from t2;
++-----------+----------------------+
+| s | udfs.count_vowels(s) |
++-----------+----------------------+
+| lower | 2 |
+| UPPER | 2 |
+| Init cap | 3 |
+| CamelCase | 4 |
++-----------+----------------------+
+Returned 4 row(s) in 0.23s
+[localhost:21000] > select count_vowels(null);
++-------------------------+
+| udfs.count_vowels(null) |
++-------------------------+
+| NULL |
++-------------------------+
+Returned 1 row(s) in 0.12s
+
+[localhost:21000] > create function strip_vowels (string) returns string location '/user/hive/udfs/libudfsample.so' symbol='StripVowels';
+[localhost:21000] > select strip_vowels('abcdefg');
++------------------------------+
+| udfs.strip_vowels('abcdefg') |
++------------------------------+
+| bcdfg |
++------------------------------+
+Returned 1 row(s) in 0.11s
+[localhost:21000] > select strip_vowels('ABCDEFG');
++------------------------------+
+| udfs.strip_vowels('ABCDEFG') |
++------------------------------+
+| BCDFG |
++------------------------------+
+Returned 1 row(s) in 0.12s
+[localhost:21000] > select strip_vowels(null);
++-------------------------+
+| udfs.strip_vowels(null) |
++-------------------------+
+| NULL |
++-------------------------+
+Returned 1 row(s) in 0.16s
+[localhost:21000] > select s, strip_vowels(s) from t2;
++-----------+----------------------+
+| s | udfs.strip_vowels(s) |
++-----------+----------------------+
+| lower | lwr |
+| UPPER | PPR |
+| Init cap | nt cp |
+| CamelCase | CmlCs |
++-----------+----------------------+
+Returned 4 row(s) in 0.24s</codeblock>
+
+ </example>
+
+ <example id="udf_sample_uda">
+
+ <title>Sample C++ UDA: SumOfSquares</title>
+
+ <p>
+ This example demonstrates a user-defined aggregate function (UDA) that produces the sum of the squares of
+ its input values.
+ </p>
+
+ <p>
+ The coding for a UDA is a little more involved than a scalar UDF, because the processing is split into
+ several phases, each implemented by a different function. Each phase is relatively straightforward: the
+ <q>update</q> and <q>merge</q> phases, where most of the work is done, read an input value and combine it
+ with some accumulated intermediate value.
+ </p>
+
+ <p>
+ As in our sample UDF from the previous example, we add function signatures to a header file (in this
+ case, <filepath>uda-sample.h</filepath>). Because this is a math-oriented UDA, we make two versions of
+ each function, one accepting an integer value and the other accepting a floating-point value.
+ </p>
+
+<codeblock>void SumOfSquaresInit(FunctionContext* context, BigIntVal* val);
+void SumOfSquaresInit(FunctionContext* context, DoubleVal* val);
+
+void SumOfSquaresUpdate(FunctionContext* context, const BigIntVal& input, BigIntVal* val);
+void SumOfSquaresUpdate(FunctionContext* context, const DoubleVal& input, DoubleVal* val);
+
+void SumOfSquaresMerge(FunctionContext* context, const BigIntVal& src, BigIntVal* dst);
+void SumOfSquaresMerge(FunctionContext* context, const DoubleVal& src, DoubleVal* dst);
+
+BigIntVal SumOfSquaresFinalize(FunctionContext* context, const BigIntVal& val);
+DoubleVal SumOfSquaresFinalize(FunctionContext* context, const DoubleVal& val);</codeblock>
+
+ <p>
+ We add the function bodies to a C++ source file (in this case, <filepath>uda-sample.cc</filepath>):
+ </p>
+
+<codeblock>void SumOfSquaresInit(FunctionContext* context, BigIntVal* val) {
+ val->is_null = false;
+ val->val = 0;
+}
+void SumOfSquaresInit(FunctionContext* context, DoubleVal* val) {
+ val->is_null = false;
+ val->val = 0.0;
+}
+
+void SumOfSquaresUpdate(FunctionContext* context, const BigIntVal& input, BigIntVal* val) {
+ if (input.is_null) return;
+ val->val += input.val * input.val;
+}
+void SumOfSquaresUpdate(FunctionContext* context, const DoubleVal& input, DoubleVal* val) {
+ if (input.is_null) return;
+ val->val += input.val * input.val;
+}
+
+void SumOfSquaresMerge(FunctionContext* context, const BigIntVal& src, BigIntVal* dst) {
+ dst->val += src.val;
+}
+void SumOfSquaresMerge(FunctionContext* context, const DoubleVal& src, DoubleVal* dst) {
+ dst->val += src.val;
+}
+
+BigIntVal SumOfSquaresFinalize(FunctionContext* context, const BigIntVal& val) {
+ return val;
+}
+DoubleVal SumOfSquaresFinalize(FunctionContext* context, const DoubleVal& val) {
+ return val;
+}</codeblock>
+
+ <p>
+ As with the sample UDF, we build a shared library and put it into HDFS:
+ </p>
+
+<codeblock>$ make
+[ 0%] Generating udf_samples/uda-sample.ll
+[ 16%] Built target uda-sample-ir
+Scanning dependencies of target udasample
+[ 33%] Building CXX object CMakeFiles/udasample.dir/uda-sample.o
+Linking CXX shared library udf_samples/libudasample.so
+[ 33%] Built target udasample
+Scanning dependencies of target uda-sample-test
+[ 50%] Building CXX object CMakeFiles/uda-sample-test.dir/uda-sample-test.o
+Linking CXX executable udf_samples/uda-sample-test
+[ 50%] Built target uda-sample-test
+[ 50%] Generating udf_samples/udf-sample.ll
+[ 66%] Built target udf-sample-ir
+[ 83%] Built target udfsample
+[100%] Built target udf-sample-test
+$ hdfs dfs -put ./udf_samples/libudasample.so /user/hive/udfs/libudasample.so</codeblock>
+
+ <p>
+ To create the SQL function, we issue a <codeph>CREATE AGGREGATE FUNCTION</codeph> statement and specify
+ the underlying C++ function names for the different phases:
+ </p>
+
+<codeblock>[localhost:21000] > use udf_testing;
+
+[localhost:21000] > create table sos (x bigint, y double);
+[localhost:21000] > insert into sos values (1, 1.1), (2, 2.2), (3, 3.3), (4, 4.4);
+Inserted 4 rows in 1.10s
+
+[localhost:21000] > create aggregate function sum_of_squares(bigint) returns bigint
+ > location '/user/hive/udfs/libudasample.so'
+ > init_fn='SumOfSquaresInit'
+ > update_fn='SumOfSquaresUpdate'
+ > merge_fn='SumOfSquaresMerge'
+ > finalize_fn='SumOfSquaresFinalize';
+
+[localhost:21000] > -- Compute the same value using literals or the UDA;
+[localhost:21000] > select 1*1 + 2*2 + 3*3 + 4*4;
++-------------------------------+
+| 1 * 1 + 2 * 2 + 3 * 3 + 4 * 4 |
++-------------------------------+
+| 30 |
++-------------------------------+
+Returned 1 row(s) in 0.12s
+[localhost:21000] > select sum_of_squares(x) from sos;
++------------------------+
+| udfs.sum_of_squares(x) |
++------------------------+
+| 30 |
++------------------------+
+Returned 1 row(s) in 0.35s</codeblock>
+
+ <p>
+ Until we create the overloaded version of the UDA, it can only handle a single data type. To allow it to
+ handle <codeph>DOUBLE</codeph> as well as <codeph>BIGINT</codeph>, we issue another <codeph>CREATE
+ AGGREGATE FUNCTION</codeph> statement:
+ </p>
+
+<codeblock>[localhost:21000] > select sum_of_squares(y) from sos;
+ERROR: AnalysisException: No matching function with signature: udfs.sum_of_squares(DOUBLE).
+
+[localhost:21000] > create aggregate function sum_of_squares(double) returns double
+ > location '/user/hive/udfs/libudasample.so'
+ > init_fn='SumOfSquaresInit'
+ > update_fn='SumOfSquaresUpdate'
+ > merge_fn='SumOfSquaresMerge'
+ > finalize_fn='SumOfSquaresFinalize';
+
+[localhost:21000] > -- Compute the same value using literals or the UDA;
+[localhost:21000] > select 1.1*1.1 + 2.2*2.2 + 3.3*3.3 + 4.4*4.4;
++-----------------------------------------------+
+| 1.1 * 1.1 + 2.2 * 2.2 + 3.3 * 3.3 + 4.4 * 4.4 |
++-----------------------------------------------+
+| 36.3 |
++-----------------------------------------------+
+Returned 1 row(s) in 0.12s
+[localhost:21000] > select sum_of_squares(y) from sos;
++------------------------+
+| udfs.sum_of_squares(y) |
++------------------------+
+| 36.3 |
++------------------------+
+Returned 1 row(s) in 0.35s</codeblock>
+
+ <p>
+ Typically, you use a UDA in queries with <codeph>GROUP BY</codeph> clauses, to produce a result set with
+ a separate aggregate value for each combination of values from the <codeph>GROUP BY</codeph> clause.
+ Let's change our sample table to use <codeph>0</codeph> to indicate rows containing even values, and
+ <codeph>1</codeph> to flag rows containing odd values. Then the <codeph>GROUP BY</codeph> query can
+ return two values, the sum of the squares for the even values, and the sum of the squares for the odd
+ values:
+ </p>
+
+<codeblock>[localhost:21000] > insert overwrite sos values (1, 1), (2, 0), (3, 1), (4, 0);
+Inserted 4 rows in 1.24s
+
+[localhost:21000] > -- Compute 1 squared + 3 squared, and 2 squared + 4 squared;
+[localhost:21000] > select y, sum_of_squares(x) from sos group by y;
++---+------------------------+
+| y | udfs.sum_of_squares(x) |
++---+------------------------+
+| 1 | 10 |
+| 0 | 20 |
++---+------------------------+
+Returned 2 row(s) in 0.43s</codeblock>
+
+ </example>
+ </conbody>
+ </concept>
+
+ <concept id="udf_security">
+
+ <title>Security Considerations for User-Defined Functions</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Security"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ When the Impala authorization feature is enabled:
+ </p>
+
+ <ul>
+ <li>
+ To call a UDF in a query, you must have the required read privilege for any databases and tables used in
+ the query.
+ </li>
+
+ <li>
+ Because incorrectly coded UDFs could cause performance or capacity problems, for example by going into
+ infinite loops or allocating excessive amounts of memory, only an administrative user can create UDFs.
+ That is, to execute the <codeph>CREATE FUNCTION</codeph> statement requires the <codeph>ALL</codeph>
+ privilege on the server.
+ </li>
+ </ul>
+
+ <p>
+ See <xref href="impala_authorization.xml#authorization"/> for details about authorization in Impala.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="udf_limits">
+
+ <title>Limitations and Restrictions for Impala UDFs</title>
+
+ <conbody>
+
+ <p>
+ The following limitations and restrictions apply to Impala UDFs in the current release:
+ </p>
+
+ <ul>
+ <li>
+ Impala does not support Hive UDFs that accept or return composite or nested types, or other types not
+ available in Impala tables.
+ </li>
+
+ <li>
+ All Impala UDFs must be deterministic, that is, produce the same output each time when passed the same
+ argument values. For example, an Impala UDF must not call functions such as <codeph>rand()</codeph> to
+ produce different values for each invocation. It must not retrieve data from external sources, such as
+ from disk or over the network.
+ </li>
+
+ <li>
+ An Impala UDF must not spawn other threads or processes.
+ </li>
+
+ <li>
+ When the <cmdname>catalogd</cmdname> process is restarted, all UDFs become undefined and must be
+ reloaded.
+ </li>
+
+ <li>
+ Impala currently does not support user-defined table functions (UDTFs).
+ </li>
+
+ <li rev="2.0.0">
+ The <codeph>CHAR</codeph> and <codeph>VARCHAR</codeph> types cannot be used as input arguments or return
+ values for UDFs.
+ </li>
+ </ul>
+ </conbody>
+ </concept>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_union.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_union.xml b/docs/topics/impala_union.xml
new file mode 100644
index 0000000..29a0b45
--- /dev/null
+++ b/docs/topics/impala_union.xml
@@ -0,0 +1,150 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="union">
+
+ <title>UNION Clause</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+      The <codeph>UNION</codeph> clause lets you combine the result sets of multiple queries. By default, the
+      result sets are combined as if the <codeph>DISTINCT</codeph> operator had been applied.
+<!--
+Because duplicate elimination can be a memory-intensive process, the more useful
+variation for most Impala queries is <codeph>UNION ALL</codeph>, which returns
+all results from both queries, even if there are duplicates.
+-->
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock><varname>query_1</varname> UNION [DISTINCT | ALL] <varname>query_2</varname></codeblock>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ The <codeph>UNION</codeph> keyword by itself is the same as <codeph>UNION DISTINCT</codeph>. Because
+ eliminating duplicates can be a memory-intensive process for a large result set, prefer <codeph>UNION
+ ALL</codeph> where practical. (That is, when you know the different queries in the union will not produce any
+ duplicates, or where the duplicate values are acceptable.)
+ </p>
+
+ <p rev="obwl">
+ When an <codeph>ORDER BY</codeph> clause applies to a <codeph>UNION ALL</codeph> or <codeph>UNION</codeph>
+ query, in Impala 1.4 and higher, the <codeph>LIMIT</codeph> clause is no longer required. To make the
+ <codeph>ORDER BY</codeph> and <codeph>LIMIT</codeph> clauses apply to the entire result set, turn the
+ <codeph>UNION</codeph> query into a subquery, <codeph>SELECT</codeph> from the subquery, and put the
+ <codeph>ORDER BY</codeph> clause at the end, outside the subquery.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ First, we set up some sample data, including duplicate <codeph>1</codeph> values.
+ </p>
+
+<codeblock rev="obwl">[localhost:21000] > create table few_ints (x int);
+[localhost:21000] > insert into few_ints values (1), (1), (2), (3);
+[localhost:21000] > set default_order_by_limit=1000;</codeblock>
+
+ <p>
+ This example shows how <codeph>UNION ALL</codeph> returns all rows from both queries, without any additional
+ filtering to eliminate duplicates. For the large result sets common with Impala queries, this is the most
+ memory-efficient technique.
+ </p>
+
+<codeblock>[localhost:21000] > select x from few_ints order by x;
++---+
+| x |
++---+
+| 1 |
+| 1 |
+| 2 |
+| 3 |
++---+
+Returned 4 row(s) in 0.41s
+[localhost:21000] > select x from few_ints union all select x from few_ints;
++---+
+| x |
++---+
+| 1 |
+| 1 |
+| 2 |
+| 3 |
+| 1 |
+| 1 |
+| 2 |
+| 3 |
++---+
+Returned 8 row(s) in 0.42s
+[localhost:21000] > select * from (select x from few_ints union all select x from few_ints) as t1 order by x;
++---+
+| x |
++---+
+| 1 |
+| 1 |
+| 1 |
+| 1 |
+| 2 |
+| 2 |
+| 3 |
+| 3 |
++---+
+Returned 8 row(s) in 0.53s
+[localhost:21000] > select x from few_ints union all select 10;
++----+
+| x |
++----+
+| 10 |
+| 1 |
+| 1 |
+| 2 |
+| 3 |
++----+
+Returned 5 row(s) in 0.38s</codeblock>
+
+ <p>
+ This example shows how the <codeph>UNION</codeph> clause without the <codeph>ALL</codeph> keyword condenses
+ the result set to eliminate all duplicate values, making the query take more time and potentially more
+ memory. The extra processing typically makes this technique not recommended for queries that return result
+ sets with millions or billions of values.
+ </p>
+
+<codeblock>[localhost:21000] > select x from few_ints union select x+1 from few_ints;
++---+
+| x |
++---+
+| 3 |
+| 4 |
+| 1 |
+| 2 |
++---+
+Returned 4 row(s) in 0.51s
+[localhost:21000] > select x from few_ints union select 10;
++----+
+| x |
++----+
+| 2 |
+| 10 |
+| 1 |
+| 3 |
++----+
+Returned 4 row(s) in 0.49s
+[localhost:21000] > select * from (select x from few_ints union select x from few_ints) as t1 order by x;
++---+
+| x |
++---+
+| 1 |
+| 2 |
+| 3 |
++---+
+Returned 3 row(s) in 0.53s</codeblock>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_update.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_update.xml b/docs/topics/impala_update.xml
new file mode 100644
index 0000000..3b9e330
--- /dev/null
+++ b/docs/topics/impala_update.xml
@@ -0,0 +1,64 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="update">
+
+ <title>UPDATE Statement (CDH 5.5 and higher only)</title>
+ <titlealts><navtitle>UPDATE</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Kudu"/>
+ <data name="Category" value="ETL"/>
+ <data name="Category" value="Ingest"/>
+ <data name="Category" value="DML"/>
+ <data name="Category" value="Data Analysts"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+    <p>
+      <indexterm audience="Cloudera">UPDATE statement</indexterm>
+      Updates one or more rows in a Kudu table.
+      Although updating a single row or a range of rows would be inefficient for tables using HDFS
+      data files, Kudu is able to perform this operation efficiently. Therefore, this statement
+      only works for Impala tables that use the Kudu storage engine.
+    </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>
+</codeblock>
+
+ <p rev="kudu" audience="impala_next">
+ Normally, an <codeph>UPDATE</codeph> operation for a Kudu table fails if
+ some partition key columns are not found, due to their being deleted or changed
+ by a concurrent <codeph>UPDATE</codeph> or <codeph>DELETE</codeph> operation.
+ Specify <codeph>UPDATE IGNORE <varname>rest_of_statement</varname></codeph> to
+ make the <codeph>UPDATE</codeph> continue in this case. The rows with the nonexistent
+ duplicate partition key column values are not changed.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/dml_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+
+ <note conref="../shared/impala_common.xml#common/compute_stats_next"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>
+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_kudu.xml#impala_kudu"/>
+ </p>
+
+ </conbody>
+
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_use.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_use.xml b/docs/topics/impala_use.xml
new file mode 100644
index 0000000..9e0b654
--- /dev/null
+++ b/docs/topics/impala_use.xml
@@ -0,0 +1,77 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="use">
+
+ <title>USE Statement</title>
+ <titlealts><navtitle>USE</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Databases"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">USE statement</indexterm>
+ Switches the current session to a specified database. The <term>current database</term> is where any
+ <codeph>CREATE TABLE</codeph>, <codeph>INSERT</codeph>, <codeph>SELECT</codeph>, or other statements act when
+ you specify a table or other object name, without prefixing it with a database name. The new current database
+      applies for the duration of the session or until another <codeph>USE</codeph> statement is executed.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>USE <varname>db_name</varname></codeblock>
+
+ <p>
+ By default, when you connect to an Impala instance, you begin in a database named <codeph>default</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Switching the default database is convenient in the following situations:
+ </p>
+
+ <ul>
+ <li>
+ To avoid qualifying each reference to a table with the database name. For example, <codeph>SELECT * FROM t1
+ JOIN t2</codeph> rather than <codeph>SELECT * FROM db.t1 JOIN db.t2</codeph>.
+ </li>
+
+ <li>
+ To do a sequence of operations all within the same database, such as creating a table, inserting data, and
+ querying the table.
+ </li>
+ </ul>
+
+ <p>
+ To start the <cmdname>impala-shell</cmdname> interpreter and automatically issue a <codeph>USE</codeph>
+ statement for a particular database, specify the option <codeph>-d <varname>db_name</varname></codeph> for
+ the <cmdname>impala-shell</cmdname> command. The <codeph>-d</codeph> option is useful to run SQL scripts,
+ such as setup or test scripts, against multiple databases without hardcoding a <codeph>USE</codeph> statement
+ into the SQL source.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ See <xref href="impala_create_database.xml#create_database"/> for examples covering <codeph>CREATE
+ DATABASE</codeph>, <codeph>USE</codeph>, and <codeph>DROP DATABASE</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_create_database.xml#create_database"/>,
+ <xref href="impala_drop_database.xml#drop_database"/>, <xref href="impala_show.xml#show_databases"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_v_cpu_cores.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_v_cpu_cores.xml b/docs/topics/impala_v_cpu_cores.xml
new file mode 100644
index 0000000..41be3af
--- /dev/null
+++ b/docs/topics/impala_v_cpu_cores.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.2" id="v_cpu_cores">
+
+ <title>V_CPU_CORES Query Option (CDH 5 only)</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Resource Management"/>
+ <data name="Category" value="YARN"/>
+ <data name="Category" value="Llama"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">V_CPU_CORES query option</indexterm>
+ The number of per-host virtual CPU cores to request from YARN. If set, the query option overrides the
+ automatic estimate from Impala.
+<!-- This sentence is used in a few places and could be conref'ed. -->
+ Used in conjunction with the Impala resource management feature in Impala 1.2 and higher and CDH 5.
+ </p>
+
+ <p>
+ <b>Type:</b> numeric
+ </p>
+
+ <p>
+ <b>Default:</b> 0 (use automatic estimates)
+ </p>
+
+<!-- Worth adding a couple of related info links here. -->
+
+ </conbody>
+</concept>
[13/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_decimal.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_decimal.xml b/docs/topics/impala_decimal.xml
new file mode 100644
index 0000000..c0c98d9
--- /dev/null
+++ b/docs/topics/impala_decimal.xml
@@ -0,0 +1,836 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.4.0" id="decimal">
+
+ <title>DECIMAL Data Type (CDH 5.1 or higher only)</title>
+ <titlealts><navtitle>DECIMAL (CDH 5.1 or higher only)</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ A numeric data type with fixed scale and precision, used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER
+ TABLE</codeph> statements. Suitable for financial and other arithmetic calculations where the imprecise
+ representation and rounding behavior of <codeph>FLOAT</codeph> and <codeph>DOUBLE</codeph> make those types
+ impractical.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ In the column definition of a <codeph>CREATE TABLE</codeph> statement:
+ </p>
+
+<codeblock><varname>column_name</varname> DECIMAL[(<varname>precision</varname>[,<varname>scale</varname>])]</codeblock>
+
+ <p>
+ <codeph>DECIMAL</codeph> with no precision or scale values is equivalent to <codeph>DECIMAL(9,0)</codeph>.
+ </p>
+
+ <p>
+ <b>Precision and Scale:</b>
+ </p>
+
+ <p>
+ <varname>precision</varname> represents the total number of digits that can be represented by the column,
+ regardless of the location of the decimal point. This value must be between 1 and 38. For example,
+ representing integer values up to 9999, and floating-point values up to 99.99, both require a precision of 4.
+ You can also represent corresponding negative values, without any change in the precision. For example, the
+ range -9999 to 9999 still only requires a precision of 4.
+ </p>
+
+ <p>
+ <varname>scale</varname> represents the number of fractional digits. This value must be less than or equal to
+ <varname>precision</varname>. A scale of 0 produces integral values, with no fractional part. If precision
+ and scale are equal, all the digits come after the decimal point, making all the values between 0 and
+ 0.999... or 0 and -0.999...
+ </p>
+
+ <p>
+ When <varname>precision</varname> and <varname>scale</varname> are omitted, a <codeph>DECIMAL</codeph> value
+ is treated as <codeph>DECIMAL(9,0)</codeph>, that is, an integer value ranging from
+ <codeph>-999,999,999</codeph> to <codeph>999,999,999</codeph>. This is the largest <codeph>DECIMAL</codeph>
+ value that can still be represented in 4 bytes. If precision is specified but scale is omitted, Impala uses a
+ value of zero for the scale.
+ </p>
+
+ <p>
+ Both <varname>precision</varname> and <varname>scale</varname> must be specified as integer literals, not any
+ other kind of constant expressions.
+ </p>
+
+ <p>
+ To check the precision or scale for arbitrary values, you can call the
+ <xref href="impala_math_functions.xml#math_functions"><codeph>precision()</codeph> and
+ <codeph>scale()</codeph> built-in functions</xref>. For example, you might use these values to figure out how
+ many characters are required for various fields in a report, or to understand the rounding characteristics of
+ a formula as applied to a particular <codeph>DECIMAL</codeph> column.
+ </p>
+
+ <p>
+ <b>Range:</b>
+ </p>
+
+ <p>
+ The maximum precision value is 38. Thus, the largest integral value is represented by
+ <codeph>DECIMAL(38,0)</codeph> (999... with 9 repeated 38 times). The most precise fractional value (between
+ 0 and 1, or 0 and -1) is represented by <codeph>DECIMAL(38,38)</codeph>, with 38 digits to the right of the
+ decimal point. The value closest to 0 would be .0000...1 (37 zeros and the final 1). The value closest to 1
+ would be .999... (9 repeated 38 times).
+ </p>
+
+ <p>
+ For a given precision and scale, the range of <codeph>DECIMAL</codeph> values is the same in the positive and
+ negative directions. For example, <codeph>DECIMAL(4,2)</codeph> can represent from -99.99 to 99.99. This is
+ different from other integral numeric types where the positive and negative bounds differ slightly.
+ </p>
+
+ <p>
+ When you use <codeph>DECIMAL</codeph> values in arithmetic expressions, the precision and scale of the result
+ value are determined as follows:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ For addition and subtraction, the precision and scale are based on the maximum possible result, that is,
+ if all the digits of the input values were 9s and the absolute values were added together.
+ </p>
+<!-- Seems like buggy output from this first query, so hiding the example for the time being. -->
+<codeblock audience="Cloudera">[localhost:21000] > select 50000.5 + 12.444, precision(50000.5 + 12.444), scale(50000.5 + 12.444);
++------------------+-----------------------------+-------------------------+
+| 50000.5 + 12.444 | precision(50000.5 + 12.444) | scale(50000.5 + 12.444) |
++------------------+-----------------------------+-------------------------+
+| 50012.944 | 9 | 3 |
++------------------+-----------------------------+-------------------------+
+[localhost:21000] > select 99999.9 + 99.999, precision(99999.9 + 99.999), scale(99999.9 + 99.999);
++------------------+-----------------------------+-------------------------+
+| 99999.9 + 99.999 | precision(99999.9 + 99.999) | scale(99999.9 + 99.999) |
++------------------+-----------------------------+-------------------------+
+| 100099.899 | 9 | 3 |
++------------------+-----------------------------+-------------------------+
+</codeblock>
+ </li>
+
+ <li>
+ <p>
+ For multiplication, the precision is the sum of the precisions of the input values. The scale is the sum
+ of the scales of the input values.
+ </p>
+ </li>
+
+<!-- Need to add some specifics to discussion of division. Details here: http://blogs.msdn.com/b/sqlprogrammability/archive/2006/03/29/564110.aspx -->
+
+ <li>
+ <p>
+ For division, Impala sets the precision and scale to values large enough to represent the whole and
+ fractional parts of the result.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ For <codeph>UNION</codeph>, the scale is the larger of the scales of the input values, and the precision
+ is increased if necessary to accommodate any additional fractional digits. If the same input value has
+ the largest precision and the largest scale, the result value has the same precision and scale. If one
+ value has a larger precision but smaller scale, the scale of the result value is increased. For example,
+ <codeph>DECIMAL(20,2) UNION DECIMAL(8,6)</codeph> produces a result of type
+ <codeph>DECIMAL(24,6)</codeph>. The extra 4 fractional digits of scale (6-2) are accommodated by
+ extending the precision by the same amount (20+4).
+ </p>
+ </li>
+
+ <li>
+ <p>
+ To doublecheck, you can always call the <codeph>PRECISION()</codeph> and <codeph>SCALE()</codeph>
+ functions on the results of an arithmetic expression to see the relevant values, or use a <codeph>CREATE
+ TABLE AS SELECT</codeph> statement to define a column based on the return type of the expression.
+ </p>
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/compatibility_blurb"/>
+
+ <ul>
+ <li>
+ Using the <codeph>DECIMAL</codeph> type is only supported under CDH 5.1.0 and higher.
+<!--
+ Although Impala-created tables containing <codeph>DECIMAL</codeph> columns are
+ readable in CDH 5.1, <codeph>DECIMAL</codeph> data is not interoperable with
+ other Hadoop components in CDH 4, and some Impala operations such as
+ <codeph>COMPUTE STATS</codeph> are not possible on such tables in CDH 4.
+ If you create a Parquet table with a <codeph>DECIMAL</codeph>
+ column under CDH 4, Impala issues a warning because the data files might not be readable from other CDH 4 components.
+-->
+ </li>
+
+<!--
+ <li>
+ The <codeph>DECIMAL</codeph> data type is a relatively new addition to the
+ Parquet file format. To read Impala-created Parquet files containing
+ <codeph>DECIMAL</codeph> columns from another Hadoop component such as
+ MapReduce, Pig, or Hive, use CDH 5.1 or higher, or the equivalent levels of the relevant components and Parquet
+ JARs from CDH 5.1.
+ If you create a Parquet table with a <codeph>DECIMAL</codeph>
+ column under CDH 4, Impala issues a warning because the data files might not be readable from other CDH 4 components.
+ </li>
+
+ <li>
+ In particular, Impala-created tables with <codeph>DECIMAL</codeph> columns are
+ not readable by Hive under CDH 4.
+ </li>
+-->
+
+ <li>
+ Use the <codeph>DECIMAL</codeph> data type in Impala for applications where you used the
+ <codeph>NUMBER</codeph> data type in Oracle. The Impala <codeph>DECIMAL</codeph> type does not support the
+ Oracle idioms of <codeph>*</codeph> for scale or negative values for precision.
+ </li>
+ </ul>
+
+ <p>
+ <b>Conversions and casting:</b>
+ </p>
+
+ <p>
+ <ph conref="../shared/impala_common.xml#common/cast_int_to_timestamp"/>
+ </p>
+
+ <p>
+ Impala automatically converts between <codeph>DECIMAL</codeph> and other numeric types where possible. A
+ <codeph>DECIMAL</codeph> with zero scale is converted to or from the smallest appropriate integral type. A
+ <codeph>DECIMAL</codeph> with a fractional part is automatically converted to or from the smallest
+ appropriate floating-point type. If the destination type does not have sufficient precision or scale to hold
+ all possible values of the source type, Impala raises an error and does not convert the value.
+ </p>
+
+ <p>
+ For example, these statements show how expressions of <codeph>DECIMAL</codeph> and other types are reconciled
+ to the same type in the context of <codeph>UNION</codeph> queries and <codeph>INSERT</codeph> statements:
+ </p>
+
+<codeblock>[localhost:21000] > select cast(1 as int) as x union select cast(1.5 as decimal(9,4)) as x;
++----------------+
+| x |
++----------------+
+| 1.5000 |
+| 1.0000 |
++----------------+
+[localhost:21000] > create table int_vs_decimal as select cast(1 as int) as x union select cast(1.5 as decimal(9,4)) as x;
++-------------------+
+| summary |
++-------------------+
+| Inserted 2 row(s) |
++-------------------+
+[localhost:21000] > desc int_vs_decimal;
++------+---------------+---------+
+| name | type | comment |
++------+---------------+---------+
+| x | decimal(14,4) | |
++------+---------------+---------+
+</codeblock>
+
+ <p>
+ To avoid potential conversion errors, you can use <codeph>CAST()</codeph> to convert <codeph>DECIMAL</codeph>
+ values to <codeph>FLOAT</codeph>, <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>, <codeph>INT</codeph>,
+ <codeph>BIGINT</codeph>, <codeph>STRING</codeph>, <codeph>TIMESTAMP</codeph>, or <codeph>BOOLEAN</codeph>.
+ You can use exponential notation in <codeph>DECIMAL</codeph> literals or when casting from
+ <codeph>STRING</codeph>, for example <codeph>1.0e6</codeph> to represent one million.
+ </p>
+
+ <p>
+ If you cast a value with more fractional digits than the scale of the destination type, any extra fractional
+ digits are truncated (not rounded). Casting a value to a target type with not enough precision produces a
+ result of <codeph>NULL</codeph> and displays a runtime warning.
+ </p>
+
+<codeblock>[localhost:21000] > select cast(1.239 as decimal(3,2));
++-----------------------------+
+| cast(1.239 as decimal(3,2)) |
++-----------------------------+
+| 1.23 |
++-----------------------------+
+[localhost:21000] > select cast(1234 as decimal(3));
++----------------------------+
+| cast(1234 as decimal(3,0)) |
++----------------------------+
+| NULL |
++----------------------------+
+WARNINGS: Expression overflowed, returning NULL
+</codeblock>
+
+ <p>
+ When you specify integer literals, for example in <codeph>INSERT ... VALUES</codeph> statements or arithmetic
+ expressions, those numbers are interpreted as the smallest applicable integer type. You must use
+ <codeph>CAST()</codeph> calls for some combinations of integer literals and <codeph>DECIMAL</codeph>
+ precision. For example, <codeph>INT</codeph> has a maximum value that is 10 digits long,
+ <codeph>TINYINT</codeph> has a maximum value that is 3 digits long, and so on. If you specify a value such as
+ 123456 to go into a <codeph>DECIMAL</codeph> column, Impala checks if the column has enough precision to
+ represent the largest value of that integer type, and raises an error if not. Therefore, use an expression
+      like <codeph>CAST(123456 AS DECIMAL(9,0))</codeph> for <codeph>DECIMAL</codeph> columns with precision 9 or
+      less, <codeph>CAST(50 AS DECIMAL(2,0))</codeph> for <codeph>DECIMAL</codeph> columns with precision 2 or
+ less, and so on. For <codeph>DECIMAL</codeph> columns with precision 10 or greater, Impala automatically
+ interprets the value as the correct <codeph>DECIMAL</codeph> type; however, because
+ <codeph>DECIMAL(10)</codeph> requires 8 bytes of storage while <codeph>DECIMAL(9)</codeph> requires only 4
+ bytes, only use precision of 10 or higher when actually needed.
+ </p>
+
+<codeblock>[localhost:21000] > create table decimals_9_0 (x decimal);
+[localhost:21000] > insert into decimals_9_0 values (1), (2), (4), (8), (16), (1024), (32768), (65536), (1000000);
+ERROR: AnalysisException: Possible loss of precision for target table 'decimal_testing.decimals_9_0'.
+Expression '1' (type: INT) would need to be cast to DECIMAL(9,0) for column 'x'
+[localhost:21000] > insert into decimals_9_0 values (cast(1 as decimal)), (cast(2 as decimal)), (cast(4 as decimal)), (cast(8 as decimal)), (cast(16 as decimal)), (cast(1024 as decimal)), (cast(32768 as decimal)), (cast(65536 as decimal)), (cast(1000000 as decimal));
+
+[localhost:21000] > create table decimals_10_0 (x decimal(10,0));
+[localhost:21000] > insert into decimals_10_0 values (1), (2), (4), (8), (16), (1024), (32768), (65536), (1000000);
+[localhost:21000] >
+</codeblock>
+
+ <p>
+ Be aware that in memory and for binary file formats such as Parquet or Avro, <codeph>DECIMAL(10)</codeph> or
+ higher consumes 8 bytes while <codeph>DECIMAL(9)</codeph> (the default for <codeph>DECIMAL</codeph>) or lower
+ consumes 4 bytes. Therefore, to conserve space in large tables, use the smallest-precision
+ <codeph>DECIMAL</codeph> type that is appropriate and <codeph>CAST()</codeph> literal values where necessary,
+ rather than declaring <codeph>DECIMAL</codeph> columns with high precision for convenience.
+ </p>
+
+ <p>
+ To represent a very large or precise <codeph>DECIMAL</codeph> value as a literal, for example one that
+ contains more digits than can be represented by a <codeph>BIGINT</codeph> literal, use a quoted string or a
+ floating-point value for the number, and <codeph>CAST()</codeph> to the desired <codeph>DECIMAL</codeph>
+ type:
+ </p>
+
+<codeblock>insert into decimals_38_5 values (1), (2), (4), (8), (16), (1024), (32768), (65536), (1000000),
+ (cast("999999999999999999999999999999" as decimal(38,5))),
+ (cast(999999999999999999999999999999. as decimal(38,5)));
+</codeblock>
+
+ <ul>
+ <li>
+ <p>
+          The result of an aggregate function such as <codeph>MAX()</codeph>, <codeph>SUM()</codeph>, or
+          <codeph>AVG()</codeph> on <codeph>DECIMAL</codeph> values is promoted to a precision of 38, with the same
+          scale as the underlying column. Thus, the result can represent the largest possible value at that
+          particular scale.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ <codeph>STRING</codeph> columns, literals, or expressions can be converted to <codeph>DECIMAL</codeph> as
+ long as the overall number of digits and digits to the right of the decimal point fit within the
+ specified precision and scale for the declared <codeph>DECIMAL</codeph> type. By default, a
+ <codeph>DECIMAL</codeph> value with no specified scale or precision can hold a maximum of 9 digits of an
+ integer value. If there are more digits in the string value than are allowed by the
+ <codeph>DECIMAL</codeph> scale and precision, the result is <codeph>NULL</codeph>.
+ </p>
+ <p>
+ The following examples demonstrate how <codeph>STRING</codeph> values with integer and fractional parts
+          are represented when converted to <codeph>DECIMAL</codeph>. If the scale is 0, the number is treated
+          as an integer value with a maximum of <varname>precision</varname> digits. If the scale is greater than
+          0, the precision must be increased to account for the digits both to the left and right of the decimal point.
+          As the scale increases, output values are printed with additional trailing zeros after the decimal
+          point if needed. Any trailing zeros after the decimal point in the <codeph>STRING</codeph> value must fit
+          within the number of digits specified by the scale.
+ </p>
+<codeblock>[localhost:21000] > select cast('100' as decimal); -- Small integer value fits within 9 digits of scale.
++-----------------------------+
+| cast('100' as decimal(9,0)) |
++-----------------------------+
+| 100 |
++-----------------------------+
+[localhost:21000] > select cast('100' as decimal(3,0)); -- Small integer value fits within 3 digits of scale.
++-----------------------------+
+| cast('100' as decimal(3,0)) |
++-----------------------------+
+| 100 |
++-----------------------------+
+[localhost:21000] > select cast('100' as decimal(2,0)); -- 2 digits of scale is not enough!
++-----------------------------+
+| cast('100' as decimal(2,0)) |
++-----------------------------+
+| NULL |
++-----------------------------+
+[localhost:21000] > select cast('100' as decimal(3,1)); -- (3,1) = 2 digits left of the decimal point, 1 to the right. Not enough.
++-----------------------------+
+| cast('100' as decimal(3,1)) |
++-----------------------------+
+| NULL |
++-----------------------------+
+[localhost:21000] > select cast('100' as decimal(4,1)); -- 4 digits total, 1 to the right of the decimal point.
++-----------------------------+
+| cast('100' as decimal(4,1)) |
++-----------------------------+
+| 100.0 |
++-----------------------------+
+[localhost:21000] > select cast('98.6' as decimal(3,1)); -- (3,1) can hold a 3 digit number with 1 fractional digit.
++------------------------------+
+| cast('98.6' as decimal(3,1)) |
++------------------------------+
+| 98.6 |
++------------------------------+
+[localhost:21000] > select cast('98.6' as decimal(15,1)); -- Larger scale allows bigger numbers but still only 1 fractional digit.
++-------------------------------+
+| cast('98.6' as decimal(15,1)) |
++-------------------------------+
+| 98.6 |
++-------------------------------+
+[localhost:21000] > select cast('98.6' as decimal(15,5)); -- Larger precision allows more fractional digits, outputs trailing zeros.
++-------------------------------+
+| cast('98.6' as decimal(15,5)) |
++-------------------------------+
+| 98.60000 |
++-------------------------------+
+[localhost:21000] > select cast('98.60000' as decimal(15,1)); -- Trailing zeros in the string must fit within 'scale' digits (1 in this case).
++-----------------------------------+
+| cast('98.60000' as decimal(15,1)) |
++-----------------------------------+
+| NULL |
++-----------------------------------+
+</codeblock>
+ </li>
+
+ <li>
+ Most built-in arithmetic functions such as <codeph>SIN()</codeph> and <codeph>COS()</codeph> continue to
+        accept only <codeph>DOUBLE</codeph> values because they are so commonly used in scientific context for
+        calculations of IEEE 754-compliant values. The built-in functions that accept and return
+ <codeph>DECIMAL</codeph> are:
+<!-- List from Skye: positive, negative, least, greatest, fnv_hash, if, nullif, zeroifnull, isnull, coalesce -->
+<!-- Nong had already told me about abs, ceil, floor, round, truncate -->
+ <ul>
+ <li>
+ <codeph>ABS()</codeph>
+ </li>
+
+ <li>
+ <codeph>CEIL()</codeph>
+ </li>
+
+ <li>
+ <codeph>COALESCE()</codeph>
+ </li>
+
+ <li>
+ <codeph>FLOOR()</codeph>
+ </li>
+
+ <li>
+ <codeph>FNV_HASH()</codeph>
+ </li>
+
+ <li>
+ <codeph>GREATEST()</codeph>
+ </li>
+
+ <li>
+ <codeph>IF()</codeph>
+ </li>
+
+ <li>
+ <codeph>ISNULL()</codeph>
+ </li>
+
+ <li>
+ <codeph>LEAST()</codeph>
+ </li>
+
+ <li>
+ <codeph>NEGATIVE()</codeph>
+ </li>
+
+ <li>
+ <codeph>NULLIF()</codeph>
+ </li>
+
+ <li>
+ <codeph>POSITIVE()</codeph>
+ </li>
+
+ <li>
+ <codeph>PRECISION()</codeph>
+ </li>
+
+ <li>
+ <codeph>ROUND()</codeph>
+ </li>
+
+ <li>
+ <codeph>SCALE()</codeph>
+ </li>
+
+ <li>
+ <codeph>TRUNCATE()</codeph>
+ </li>
+
+ <li>
+ <codeph>ZEROIFNULL()</codeph>
+ </li>
+ </ul>
+ See <xref href="impala_functions.xml#builtins"/> for details.
+ </li>
+
+ <li>
+ <p>
+ <codeph>BIGINT</codeph>, <codeph>INT</codeph>, <codeph>SMALLINT</codeph>, and <codeph>TINYINT</codeph>
+ values can all be cast to <codeph>DECIMAL</codeph>. The number of digits to the left of the decimal point
+ in the <codeph>DECIMAL</codeph> type must be sufficient to hold the largest value of the corresponding
+ integer type. Note that integer literals are treated as the smallest appropriate integer type, meaning
+ there is sometimes a range of values that require one more digit of <codeph>DECIMAL</codeph> precision than
+ you might expect. For integer values, the scale of the <codeph>DECIMAL</codeph> type can be zero; if
+ the scale is greater than zero, remember to increase the precision value by an equivalent amount to hold
+ the required number of digits to the left of the decimal point.
+ </p>
+ <p>
+ The following examples show how different integer types are converted to <codeph>DECIMAL</codeph>.
+ </p>
+<!-- According to Nong, it's a bug that so many integer digits can be converted to a DECIMAL
+ value with small (s,p) spec. So expect to re-do this example. -->
+<codeblock>[localhost:21000] > select cast(1 as decimal(1,0));
++-------------------------+
+| cast(1 as decimal(1,0)) |
++-------------------------+
+| 1 |
++-------------------------+
+[localhost:21000] > select cast(9 as decimal(1,0));
++-------------------------+
+| cast(9 as decimal(1,0)) |
++-------------------------+
+| 9 |
++-------------------------+
+[localhost:21000] > select cast(10 as decimal(1,0));
++--------------------------+
+| cast(10 as decimal(1,0)) |
++--------------------------+
+| 10 |
++--------------------------+
+[localhost:21000] > select cast(10 as decimal(1,1));
++--------------------------+
+| cast(10 as decimal(1,1)) |
++--------------------------+
+| 10.0 |
++--------------------------+
+[localhost:21000] > select cast(100 as decimal(1,1));
++---------------------------+
+| cast(100 as decimal(1,1)) |
++---------------------------+
+| 100.0 |
++---------------------------+
+[localhost:21000] > select cast(1000 as decimal(1,1));
++----------------------------+
+| cast(1000 as decimal(1,1)) |
++----------------------------+
+| 1000.0 |
++----------------------------+
+</codeblock>
+ </li>
+
+ <li>
+ <p>
+ When a <codeph>DECIMAL</codeph> value is converted to any of the integer types, any fractional part is
+ truncated (that is, rounded towards zero):
+ </p>
+<codeblock>[localhost:21000] > create table num_dec_days (x decimal(4,1));
+[localhost:21000] > insert into num_dec_days values (1), (2), (cast(4.5 as decimal(4,1)));
+[localhost:21000] > insert into num_dec_days values (cast(0.1 as decimal(4,1))), (cast(.9 as decimal(4,1))), (cast(9.1 as decimal(4,1))), (cast(9.9 as decimal(4,1)));
+[localhost:21000] > select cast(x as int) from num_dec_days;
++----------------+
+| cast(x as int) |
++----------------+
+| 1 |
+| 2 |
+| 4 |
+| 0 |
+| 0 |
+| 9 |
+| 9 |
++----------------+
+</codeblock>
+ </li>
+
+ <li>
+ <p>
+ You cannot directly cast <codeph>TIMESTAMP</codeph> or <codeph>BOOLEAN</codeph> values to or from
+ <codeph>DECIMAL</codeph> values. You can turn a <codeph>DECIMAL</codeph> value into a time-related
+ representation using a two-step process, by converting it to an integer value and then using that result
+ in a call to a date and time function such as <codeph>from_unixtime()</codeph>.
+ </p>
+<codeblock>[localhost:21000] > select from_unixtime(cast(cast(1000.0 as decimal) as bigint));
++-------------------------------------------------------------+
+| from_unixtime(cast(cast(1000.0 as decimal(9,0)) as bigint)) |
++-------------------------------------------------------------+
+| 1970-01-01 00:16:40 |
++-------------------------------------------------------------+
+[localhost:21000] > select now() + interval cast(x as int) days from num_dec_days; -- x is a DECIMAL column.
+
+[localhost:21000] > create table num_dec_days (x decimal(4,1));
+[localhost:21000] > insert into num_dec_days values (1), (2), (cast(4.5 as decimal(4,1)));
+[localhost:21000] > select now() + interval cast(x as int) days from num_dec_days; -- The 4.5 value is truncated to 4 and becomes '4 days'.
++--------------------------------------+
+| now() + interval cast(x as int) days |
++--------------------------------------+
+| 2014-05-13 23:11:55.163284000 |
+| 2014-05-14 23:11:55.163284000 |
+| 2014-05-16 23:11:55.163284000 |
++--------------------------------------+
+</codeblock>
+ </li>
+
+ <li>
+ <p>
+ Because values in <codeph>INSERT</codeph> statements are checked rigorously for type compatibility, be
+ prepared to use <codeph>CAST()</codeph> function calls around literals, column references, or other
+ expressions that you are inserting into a <codeph>DECIMAL</codeph> column.
+ </p>
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/null_bad_numeric_cast"/>
+
+ <p>
+ <b>DECIMAL differences from integer and floating-point types:</b>
+ </p>
+
+ <p>
+ With the <codeph>DECIMAL</codeph> type, you are concerned with the number of overall digits of a number
+ rather than powers of 2 (as in <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>, and so on). Therefore,
+ the limits with integral values of <codeph>DECIMAL</codeph> types fall around 99, 999, 9999, and so on rather
+ than 32767, 65535, 2
+ <sup>32</sup>
+ -1, and so on. For fractional values, you do not need to account for imprecise representation of the
+ fractional part according to the IEEE-754 standard (as in <codeph>FLOAT</codeph> and
+ <codeph>DOUBLE</codeph>). Therefore, when you insert a fractional value into a <codeph>DECIMAL</codeph>
+ column, you can compare, sum, query, <codeph>GROUP BY</codeph>, and so on that column and get back the
+ original values rather than some <q>close but not identical</q> value.
+ </p>
+
+ <p>
+ <codeph>FLOAT</codeph> and <codeph>DOUBLE</codeph> can cause problems or unexpected behavior due to inability
+ to precisely represent certain fractional values, for example dollar and cents values for currency. You might
+ find output values slightly different than you inserted, equality tests that do not match precisely, or
+ unexpected values for <codeph>GROUP BY</codeph> columns. <codeph>DECIMAL</codeph> can help reduce unexpected
+ behavior and rounding errors, at the expense of some performance overhead for assignments and comparisons.
+ </p>
+
+ <p>
+ <b>Literals and expressions:</b>
+ <ul>
+ <li>
+ <p>
+ When you use an integer literal such as <codeph>1</codeph> or <codeph>999</codeph> in a SQL statement,
+ depending on the context, Impala will treat it as either the smallest appropriate
+ <codeph>DECIMAL</codeph> type, or the smallest integer type (<codeph>TINYINT</codeph>,
+ <codeph>SMALLINT</codeph>, <codeph>INT</codeph>, or <codeph>BIGINT</codeph>). To minimize memory usage,
+ Impala prefers to treat the literal as the smallest appropriate integer type.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ When you use a floating-point literal such as <codeph>1.1</codeph> or <codeph>999.44</codeph> in a SQL
+ statement, depending on the context, Impala will treat it as either the smallest appropriate
+ <codeph>DECIMAL</codeph> type, or the smallest floating-point type (<codeph>FLOAT</codeph> or
+ <codeph>DOUBLE</codeph>). To avoid loss of accuracy, Impala prefers to treat the literal as a
+ <codeph>DECIMAL</codeph>.
+ </p>
+ </li>
+ </ul>
+ </p>
+
+ <p>
+ <b>Storage considerations:</b>
+ </p>
+
+ <ul>
+ <li>
+ Only the precision determines the storage size for <codeph>DECIMAL</codeph> values; the scale setting has
+ no effect on the storage size.
+ </li>
+
+ <li>
+ Text, RCFile, and SequenceFile tables all use ASCII-based formats. In these text-based file formats,
+ leading zeros are not stored, but trailing zeros are stored. In these tables, each <codeph>DECIMAL</codeph>
+ value takes up as many bytes as there are digits in the value, plus an extra byte if the decimal point is
+ present and an extra byte for negative values. Once the values are loaded into memory, they are represented
+ in 4, 8, or 16 bytes as described in the following list items. The on-disk representation varies depending
+ on the file format of the table.
+ </li>
+
+<!-- Next couple of points can be conref'ed with identical list bullets farther down under File Format Considerations. -->
+
+ <li>
+ Parquet and Avro tables use binary formats. In these tables, Impala stores each value in as few bytes as
+ possible
+<!-- 4, 8, or 16 bytes -->
+ depending on the precision specified for the <codeph>DECIMAL</codeph> column.
+ <ul>
+ <li>
+ In memory, <codeph>DECIMAL</codeph> values with precision of 9 or less are stored in 4 bytes.
+ </li>
+
+ <li>
+ In memory, <codeph>DECIMAL</codeph> values with precision of 10 through 18 are stored in 8 bytes.
+ </li>
+
+ <li>
+ In memory, <codeph>DECIMAL</codeph> values with precision greater than 18 are stored in 16 bytes.
+ </li>
+ </ul>
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/file_format_blurb"/>
+
+ <ul>
+ <li>
+ The <codeph>DECIMAL</codeph> data type can be stored in any of the file formats supported by Impala, as
+ described in <xref href="impala_file_formats.xml#file_formats"/>. Impala only writes to tables that use the
+ Parquet and text formats, so those formats are the focus for file format compatibility.
+ </li>
+
+ <li>
+ Impala can query Avro, RCFile, or SequenceFile tables containing <codeph>DECIMAL</codeph> columns, created
+ by other Hadoop components, on CDH 5 only.
+ </li>
+
+ <li>
+ You can use <codeph>DECIMAL</codeph> columns in Impala tables that are mapped to HBase tables. Impala can
+ query and insert into such tables.
+ </li>
+
+ <li>
+ Text, RCFile, and SequenceFile tables all use ASCII-based formats. In these tables, each
+ <codeph>DECIMAL</codeph> value takes up as many bytes as there are digits in the value, plus an extra byte
+ if the decimal point is present. The binary format of Parquet or Avro files offers more compact storage for
+ <codeph>DECIMAL</codeph> columns.
+ </li>
+
+ <li>
+ Parquet and Avro tables use binary formats. In these tables, Impala stores each value in 4, 8, or 16 bytes
+ depending on the precision specified for the <codeph>DECIMAL</codeph> column.
+ </li>
+
+ <li>
+ Parquet files containing <codeph>DECIMAL</codeph> columns are not expected to be readable under CDH 4. See
+ the <b>Compatibility</b> section for details.
+ </li>
+ </ul>
+
+ <p>
+ <b>UDF considerations:</b> When writing a C++ UDF, use the <codeph>DecimalVal</codeph> data type defined in
+ <filepath>/usr/include/impala_udf/udf.h</filepath>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/partitioning_blurb"/>
+
+ <p>
+ You can use a <codeph>DECIMAL</codeph> column as a partition key. Doing so provides a better match between
+ the partition key values and the HDFS directory names than using a <codeph>DOUBLE</codeph> or
+ <codeph>FLOAT</codeph> partitioning column.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/schema_evolution_blurb"/>
+
+ <ul>
+ <li>
+ For text-based formats (text, RCFile, and SequenceFile tables), you can issue an <codeph>ALTER TABLE ...
+ REPLACE COLUMNS</codeph> statement to change the precision and scale of an existing
+ <codeph>DECIMAL</codeph> column. As long as the values in the column fit within the new precision and
+ scale, they are returned correctly by a query. Any values that do not fit within the new precision and
+ scale are returned as <codeph>NULL</codeph>, and Impala reports the conversion error. Leading zeros do not
+ count against the precision value, but trailing zeros after the decimal point do.
+<codeblock>[localhost:21000] > create table text_decimals (x string);
+[localhost:21000] > insert into text_decimals values ("1"), ("2"), ("99.99"), ("1.234"), ("000001"), ("1.000000000");
+[localhost:21000] > select * from text_decimals;
++-------------+
+| x |
++-------------+
+| 1 |
+| 2 |
+| 99.99 |
+| 1.234 |
+| 000001 |
+| 1.000000000 |
++-------------+
+[localhost:21000] > alter table text_decimals replace columns (x decimal(4,2));
+[localhost:21000] > select * from text_decimals;
++-------+
+| x |
++-------+
+| 1.00 |
+| 2.00 |
+| 99.99 |
+| NULL |
+| 1.00 |
+| NULL |
++-------+
+ERRORS:
+Backend 0:Error converting column: 0 TO DECIMAL(4, 2) (Data is: 1.234)
+file: hdfs://127.0.0.1:8020/user/hive/warehouse/decimal_testing.db/text_decimals/634d4bd3aa0
+e8420-b4b13bab7f1be787_56794587_data.0
+record: 1.234
+Error converting column: 0 TO DECIMAL(4, 2) (Data is: 1.000000000)
+file: hdfs://127.0.0.1:8020/user/hive/warehouse/decimal_testing.db/text_decimals/cd40dc68e20
+c565a-cc4bd86c724c96ba_311873428_data.0
+record: 1.000000000
+</codeblock>
+ </li>
+
+ <li>
+ For binary formats (Parquet and Avro tables), although an <codeph>ALTER TABLE ... REPLACE COLUMNS</codeph>
+ statement that changes the precision or scale of a <codeph>DECIMAL</codeph> column succeeds, any subsequent
+ attempt to query the changed column results in a fatal error. (The other columns can still be queried
+ successfully.) This is because the metadata about the columns is stored in the data files themselves, and
+ <codeph>ALTER TABLE</codeph> does not actually make any updates to the data files. If the metadata in the
+ data files disagrees with the metadata in the metastore database, Impala cancels the query.
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>CREATE TABLE t1 (x DECIMAL, y DECIMAL(5,2), z DECIMAL(25,0));
+INSERT INTO t1 VALUES (5, 99.44, 123456), (300, 6.7, 999999999);
+SELECT x+y, ROUND(y,1), z/98.6 FROM t1;
+SELECT CAST(1000.5 AS DECIMAL);
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/decimal_no_stats"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/partitioning_good"/> -->
+
+ <p conref="../shared/impala_common.xml#common/hbase_ok"/>
+
+ <p conref="../shared/impala_common.xml#common/parquet_ok"/>
+
+ <p conref="../shared/impala_common.xml#common/text_bulky"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> -->
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/internals_blurb"/> -->
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/added_in_20"/> -->
+
+ <p conref="../shared/impala_common.xml#common/column_stats_constant"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_literals.xml#numeric_literals"/>, <xref href="impala_tinyint.xml#tinyint"/>,
+ <xref href="impala_smallint.xml#smallint"/>, <xref href="impala_int.xml#int"/>,
+ <xref href="impala_bigint.xml#bigint"/>, <xref href="impala_decimal.xml#decimal"/>,
+ <xref href="impala_math_functions.xml#math_functions"/> (especially <codeph>PRECISION()</codeph> and
+ <codeph>SCALE()</codeph>)
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_default_order_by_limit.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_default_order_by_limit.xml b/docs/topics/impala_default_order_by_limit.xml
new file mode 100644
index 0000000..def0335
--- /dev/null
+++ b/docs/topics/impala_default_order_by_limit.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="obwl" id="default_order_by_limit">
+
+ <title>DEFAULT_ORDER_BY_LIMIT Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p conref="../shared/impala_common.xml#common/obwl_query_options"/>
+
+ <p rev="1.4.0">
+ Prior to Impala 1.4.0, Impala queries that use the <codeph><xref href="impala_order_by.xml#order_by">ORDER
+ BY</xref></codeph> clause must also include a
+ <codeph><xref href="impala_limit.xml#limit">LIMIT</xref></codeph> clause, to avoid accidentally producing
+ huge result sets that must be sorted. Sorting a huge result set is a memory-intensive operation. In Impala
+ 1.4.0 and higher, Impala uses a temporary disk work area to perform the sort if that operation would
+ otherwise exceed the Impala memory limit on a particular host.
+ </p>
+
+ <p>
+ <b>Type: numeric</b>
+ </p>
+
+ <p>
+ <b>Default:</b> -1 (no default limit)
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_delete.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_delete.xml b/docs/topics/impala_delete.xml
new file mode 100644
index 0000000..fcac5e4
--- /dev/null
+++ b/docs/topics/impala_delete.xml
@@ -0,0 +1,64 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="delete">
+
+ <title>DELETE Statement (CDH 5.5 and higher only)</title>
+ <titlealts><navtitle>DELETE</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Kudu"/>
+ <data name="Category" value="ETL"/>
+ <data name="Category" value="Ingest"/>
+ <data name="Category" value="DML"/>
+ <data name="Category" value="Data Analysts"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">DELETE statement</indexterm>
+ Deletes one or more rows from a Kudu table.
+ Although deleting a single row or a range of rows would be inefficient for tables using HDFS
+ data files, Kudu is able to perform this operation efficiently. Therefore, this statement
+ only works for Impala tables that use the Kudu storage engine.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>
+</codeblock>
+
+ <p rev="kudu" audience="impala_next">
+ Normally, a <codeph>DELETE</codeph> operation for a Kudu table fails if
+ some partition key columns are not found, due to their being deleted or changed
+ by a concurrent <codeph>UPDATE</codeph> or <codeph>DELETE</codeph> operation.
+ Specify <codeph>DELETE IGNORE <varname>rest_of_statement</varname></codeph> to
+ make the <codeph>DELETE</codeph> continue in this case. The rows with the nonexistent
+ duplicate partition key column values are not removed.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/dml_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+
+ <note conref="../shared/impala_common.xml#common/compute_stats_next"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>
+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_kudu.xml#impala_kudu"/>
+ </p>
+
+ </conbody>
+
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_describe.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_describe.xml b/docs/topics/impala_describe.xml
new file mode 100644
index 0000000..ffdb505
--- /dev/null
+++ b/docs/topics/impala_describe.xml
@@ -0,0 +1,561 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="describe">
+
+ <title id="desc">DESCRIBE Statement</title>
+ <titlealts><navtitle>DESCRIBE</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Reports"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">DESCRIBE statement</indexterm>
+ The <codeph>DESCRIBE</codeph> statement displays metadata about a table, such as the column names and their
+ data types. Its syntax is:
+ </p>
+
+<codeblock rev="2.3.0">DESCRIBE [FORMATTED] [<varname>db_name</varname>.]<varname>table_name</varname>[.<varname>complex_col_name</varname> ...]</codeblock>
+
+ <p>
+ You can use the abbreviation <codeph>DESC</codeph> for the <codeph>DESCRIBE</codeph> statement.
+ </p>
+
+ <p rev="1.1">
+ The <codeph>DESCRIBE FORMATTED</codeph> variation displays additional information, in a format familiar to
+ users of Apache Hive. The extra information includes low-level details such as whether the table is internal
+ or external, when it was created, the file format, the location of the data in HDFS, whether the object is a
+ table or a view, and (for views) the text of the query from the view definition.
+ </p>
+
+ <note>
+ The <codeph>Compressed</codeph> field is not a reliable indicator of whether the table contains compressed
+ data. It typically shows <codeph>No</codeph>, because the compression settings only apply during the
+ session that loads data and are not stored persistently with the table metadata.
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p rev="2.3.0">
+ For the <codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, and <codeph>MAP</codeph> types available in
+ CDH 5.5 / Impala 2.3 and higher, the <codeph>DESCRIBE</codeph> output is formatted to avoid
+ excessively long lines for multiple fields within a <codeph>STRUCT</codeph>, or a nested sequence of
+ complex types.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_describe"/>
+
+ <p rev="2.3.0">
+ For example, here is the <codeph>DESCRIBE</codeph> output for a table containing a single top-level column
+ of each complex type:
+ </p>
+
+<codeblock rev="2.3.0"><![CDATA[create table t1 (x int, a array<int>, s struct<f1: string, f2: bigint>, m map<string,int>) stored as parquet;
+
+describe t1;
++------+-----------------+---------+
+| name | type | comment |
++------+-----------------+---------+
+| x | int | |
+| a | array<int> | |
+| s | struct< | |
+| | f1:string, | |
+| | f2:bigint | |
+| | > | |
+| m | map<string,int> | |
++------+-----------------+---------+
+]]>
+</codeblock>
+
+ <p rev="2.3.0">
+ Here are examples showing how to <q>drill down</q> into the layouts of complex types, including
+ using multi-part names to examine the definitions of nested types.
+ The <codeph>< ></codeph> delimiters identify the columns with complex types;
+ these are the columns where you can descend another level to see the parts that make up
+ the complex type.
+ This technique helps you to understand the multi-part names you use as table references in queries
+ involving complex types, and the corresponding column names you refer to in the <codeph>SELECT</codeph> list.
+ These tables are from the <q>nested TPC-H</q> schema, shown in detail in
+ <xref href="impala_complex_types.xml#complex_sample_schema"/>.
+ </p>
+
+ <p>
+ The <codeph>REGION</codeph> table contains an <codeph>ARRAY</codeph> of <codeph>STRUCT</codeph>
+ elements:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ The first <codeph>DESCRIBE</codeph> specifies the table name, to display the definition
+ of each top-level column.
+ </p>
+ </li>
+ <li>
+ <p>
+ The second <codeph>DESCRIBE</codeph> specifies the name of a complex
+ column, <codeph>REGION.R_NATIONS</codeph>, showing that when you include the name of an <codeph>ARRAY</codeph>
+ column in a <codeph>FROM</codeph> clause, that table reference acts like a two-column table with
+ columns <codeph>ITEM</codeph> and <codeph>POS</codeph>.
+ </p>
+ </li>
+ <li>
+ <p>
+ The final <codeph>DESCRIBE</codeph> specifies the fully qualified name of the <codeph>ITEM</codeph> field,
+ to display the layout of its underlying <codeph>STRUCT</codeph> type in table format, with the fields
+ mapped to column names.
+ </p>
+ </li>
+ </ul>
+
+<codeblock rev="2.3.0"><![CDATA[
+-- #1: The overall layout of the entire table.
+describe region;
++-------------+-------------------------+---------+
+| name | type | comment |
++-------------+-------------------------+---------+
+| r_regionkey | smallint | |
+| r_name | string | |
+| r_comment | string | |
+| r_nations | array<struct< | |
+| | n_nationkey:smallint, | |
+| | n_name:string, | |
+| | n_comment:string | |
+| | >> | |
++-------------+-------------------------+---------+
+
+-- #2: The ARRAY column within the table.
+describe region.r_nations;
++------+-------------------------+---------+
+| name | type | comment |
++------+-------------------------+---------+
+| item | struct< | |
+| | n_nationkey:smallint, | |
+| | n_name:string, | |
+| | n_comment:string | |
+| | > | |
+| pos | bigint | |
++------+-------------------------+---------+
+
+-- #3: The STRUCT that makes up each ARRAY element.
+-- The fields of the STRUCT act like columns of a table.
+describe region.r_nations.item;
++-------------+----------+---------+
+| name | type | comment |
++-------------+----------+---------+
+| n_nationkey | smallint | |
+| n_name | string | |
+| n_comment | string | |
++-------------+----------+---------+
+]]>
+</codeblock>
+
+ <p>
+ The <codeph>CUSTOMER</codeph> table contains an <codeph>ARRAY</codeph> of <codeph>STRUCT</codeph>
+ elements, where one field in the <codeph>STRUCT</codeph> is another <codeph>ARRAY</codeph> of
+ <codeph>STRUCT</codeph> elements:
+ </p>
+ <ul>
+ <li>
+ <p>
+ Again, the initial <codeph>DESCRIBE</codeph> specifies only the table name.
+ </p>
+ </li>
+ <li>
+ <p>
+ The second <codeph>DESCRIBE</codeph> specifies the qualified name of the complex
+ column, <codeph>CUSTOMER.C_ORDERS</codeph>, showing how an <codeph>ARRAY</codeph>
+ is represented as a two-column table with columns <codeph>ITEM</codeph> and <codeph>POS</codeph>.
+ </p>
+ </li>
+ <li>
+ <p>
+ The third <codeph>DESCRIBE</codeph> specifies the qualified name of the <codeph>ITEM</codeph>
+ of the <codeph>ARRAY</codeph> column, to see the structure of the nested <codeph>ARRAY</codeph>.
+ Again, it has two parts, <codeph>ITEM</codeph> and <codeph>POS</codeph>. Because the
+ <codeph>ARRAY</codeph> contains a <codeph>STRUCT</codeph>, the layout of the <codeph>STRUCT</codeph>
+ is shown.
+ </p>
+ </li>
+ <li>
+ <p>
+ The fourth and fifth <codeph>DESCRIBE</codeph> statements drill down into a <codeph>STRUCT</codeph> field that
+ is itself a complex type, an <codeph>ARRAY</codeph> of <codeph>STRUCT</codeph>.
+ The <codeph>ITEM</codeph> portion of the qualified name is only required when the <codeph>ARRAY</codeph>
+ elements are anonymous. The fields of the <codeph>STRUCT</codeph> give names to any other complex types
+ nested inside the <codeph>STRUCT</codeph>. Therefore, the <codeph>DESCRIBE</codeph> parameters
+ <codeph>CUSTOMER.C_ORDERS.ITEM.O_LINEITEMS</codeph> and <codeph>CUSTOMER.C_ORDERS.O_LINEITEMS</codeph>
+ are equivalent. (For brevity, Cloudera recommends leaving out the <codeph>ITEM</codeph> portion of
+ a qualified name when it is not required.)
+ </p>
+ </li>
+ <li>
+ <p>
+ The final <codeph>DESCRIBE</codeph> shows the layout of the deeply nested <codeph>STRUCT</codeph> type.
+ Because there are no more complex types nested inside this <codeph>STRUCT</codeph>, this is as far
+ as you can drill down into the layout for this table.
+ </p>
+ </li>
+ </ul>
+
+<codeblock rev="2.3.0"><![CDATA[-- #1: The overall layout of the entire table.
+describe customer;
++--------------+------------------------------------+
+| name | type |
++--------------+------------------------------------+
+| c_custkey | bigint |
+... more scalar columns ...
+| c_orders | array<struct< |
+| | o_orderkey:bigint, |
+| | o_orderstatus:string, |
+| | o_totalprice:decimal(12,2), |
+| | o_orderdate:string, |
+| | o_orderpriority:string, |
+| | o_clerk:string, |
+| | o_shippriority:int, |
+| | o_comment:string, |
+| | o_lineitems:array<struct< |
+| | l_partkey:bigint, |
+| | l_suppkey:bigint, |
+| | l_linenumber:int, |
+| | l_quantity:decimal(12,2), |
+| | l_extendedprice:decimal(12,2), |
+| | l_discount:decimal(12,2), |
+| | l_tax:decimal(12,2), |
+| | l_returnflag:string, |
+| | l_linestatus:string, |
+| | l_shipdate:string, |
+| | l_commitdate:string, |
+| | l_receiptdate:string, |
+| | l_shipinstruct:string, |
+| | l_shipmode:string, |
+| | l_comment:string |
+| | >> |
+| | >> |
++--------------+------------------------------------+
+
+-- #2: The ARRAY column within the table.
+describe customer.c_orders;
++------+------------------------------------+
+| name | type |
++------+------------------------------------+
+| item | struct< |
+| | o_orderkey:bigint, |
+| | o_orderstatus:string, |
+... more struct fields ...
+| | o_lineitems:array<struct< |
+| | l_partkey:bigint, |
+| | l_suppkey:bigint, |
+... more nested struct fields ...
+| | l_comment:string |
+| | >> |
+| | > |
+| pos | bigint |
++------+------------------------------------+
+
+-- #3: The STRUCT that makes up each ARRAY element.
+-- The fields of the STRUCT act like columns of a table.
+describe customer.c_orders.item;
++-----------------+----------------------------------+
+| name | type |
++-----------------+----------------------------------+
+| o_orderkey | bigint |
+| o_orderstatus | string |
+| o_totalprice | decimal(12,2) |
+| o_orderdate | string |
+| o_orderpriority | string |
+| o_clerk | string |
+| o_shippriority | int |
+| o_comment | string |
+| o_lineitems | array<struct< |
+| | l_partkey:bigint, |
+| | l_suppkey:bigint, |
+... more struct fields ...
+| | l_comment:string |
+| | >> |
++-----------------+----------------------------------+
+
+-- #4: The ARRAY nested inside the STRUCT elements of the first ARRAY.
+describe customer.c_orders.item.o_lineitems;
++------+----------------------------------+
+| name | type |
++------+----------------------------------+
+| item | struct< |
+| | l_partkey:bigint, |
+| | l_suppkey:bigint, |
+... more struct fields ...
+| | l_comment:string |
+| | > |
+| pos | bigint |
++------+----------------------------------+
+
+-- #5: Shorter form of the previous DESCRIBE. Omits the .ITEM portion of the name
+-- because O_LINEITEMS and other field names provide a way to refer to things
+-- inside the ARRAY element.
+describe customer.c_orders.o_lineitems;
++------+----------------------------------+
+| name | type |
++------+----------------------------------+
+| item | struct< |
+| | l_partkey:bigint, |
+| | l_suppkey:bigint, |
+... more struct fields ...
+| | l_comment:string |
+| | > |
+| pos | bigint |
++------+----------------------------------+
+
+-- #6: The STRUCT representing ARRAY elements nested inside
+-- another ARRAY of STRUCTs. The lack of any complex types
+-- in this output means this is as far as DESCRIBE can
+-- descend into the table layout.
+describe customer.c_orders.o_lineitems.item;
++-----------------+---------------+
+| name | type |
++-----------------+---------------+
+| l_partkey | bigint |
+| l_suppkey | bigint |
+... more scalar columns ...
+| l_comment | string |
++-----------------+---------------+
+]]>
+</codeblock>
+
+<p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+<p>
+ After the <cmdname>impalad</cmdname> daemons are restarted, the first query against a table can take longer
+ than subsequent queries, because the metadata for the table is loaded before the query is processed. This
+ one-time delay for each table can cause misleading results in benchmark tests or cause unnecessary concern.
+ To <q>warm up</q> the Impala metadata cache, you can issue a <codeph>DESCRIBE</codeph> statement in advance
+ for each table you intend to access later.
+</p>
+
+<p>
+ When you are dealing with data files stored in HDFS, sometimes it is important to know details such as the
+ path of the data files for an Impala table, and the host name for the namenode. You can get this information
+ from the <codeph>DESCRIBE FORMATTED</codeph> output. You specify HDFS URIs or path specifications with
+ statements such as <codeph>LOAD DATA</codeph> and the <codeph>LOCATION</codeph> clause of <codeph>CREATE
+ TABLE</codeph> or <codeph>ALTER TABLE</codeph>. You might also use HDFS URIs or paths with Linux commands
+ such as <cmdname>hadoop</cmdname> and <cmdname>hdfs</cmdname> to copy, rename, and so on, data files in HDFS.
+</p>
+
+<p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+
+<p rev="1.2.1">
+ Each table can also have associated table statistics and column statistics. To see these categories of
+ information, use the <codeph>SHOW TABLE STATS <varname>table_name</varname></codeph> and <codeph>SHOW COLUMN
+ STATS <varname>table_name</varname></codeph> statements.
+<!--
+For example, the table statistics can often show you the number
+and total size of the files in the table, even if you have not
+run <codeph>COMPUTE STATS</codeph>.
+-->
+ See <xref href="impala_show.xml#show"/> for details.
+</p>
+
+<p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+<p rev="2.3.0">
+ Because the column definitions for complex types can become long, particularly when such types are nested,
+ the <codeph>DESCRIBE</codeph> statement uses special formatting for complex type columns to make the output readable.
+</p>
+
+<note conref="../shared/impala_common.xml#common/compute_stats_next"/>
+
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<p>
+ The following example shows the results of both a standard <codeph>DESCRIBE</codeph> and <codeph>DESCRIBE
+ FORMATTED</codeph> for different kinds of schema objects:
+</p>
+
+<ul>
+ <li>
+ <codeph>DESCRIBE</codeph> for a table or a view returns the name, type, and comment for each of the
+ columns. For a view, if the column value is computed by an expression, the column name is automatically
+ generated as <codeph>_c0</codeph>, <codeph>_c1</codeph>, and so on depending on the ordinal number of the
+ column.
+ </li>
+
+ <li>
+ A table created with no special format or storage clauses is designated as a <codeph>MANAGED_TABLE</codeph>
+ (an <q>internal table</q> in Impala terminology). Its data files are stored in an HDFS directory under the
+ default Hive data directory. By default, it uses Text data format.
+ </li>
+
+ <li>
+ A view is designated as <codeph>VIRTUAL_VIEW</codeph> in <codeph>DESCRIBE FORMATTED</codeph> output. Some
+ of its properties are <codeph>NULL</codeph> or blank because they are inherited from the base table. The
+ text of the query that defines the view is part of the <codeph>DESCRIBE FORMATTED</codeph> output.
+ </li>
+
+ <li>
+ A table with additional clauses in the <codeph>CREATE TABLE</codeph> statement has differences in
+ <codeph>DESCRIBE FORMATTED</codeph> output. The output for <codeph>T2</codeph> includes the
+ <codeph>EXTERNAL_TABLE</codeph> keyword because of the <codeph>CREATE EXTERNAL TABLE</codeph> syntax, and
+ different <codeph>InputFormat</codeph> and <codeph>OutputFormat</codeph> fields to reflect the Parquet file
+ format.
+ </li>
+ </ul>
+
+<codeblock>[localhost:21000] > create table t1 (x int, y int, s string);
+Query: create table t1 (x int, y int, s string)
+[localhost:21000] > describe t1;
+Query: describe t1
+Query finished, fetching results ...
++------+--------+---------+
+| name | type | comment |
++------+--------+---------+
+| x | int | |
+| y | int | |
+| s | string | |
++------+--------+---------+
+Returned 3 row(s) in 0.13s
+[localhost:21000] > describe formatted t1;
+Query: describe formatted t1
+Query finished, fetching results ...
++------------------------------+--------------------------------------------------------------------+----------------------+
+| name | type | comment |
++------------------------------+--------------------------------------------------------------------+----------------------+
+| # col_name | data_type | comment |
+| | NULL | NULL |
+| x | int | None |
+| y | int | None |
+| s | string | None |
+| | NULL | NULL |
+| # Detailed Table Information | NULL | NULL |
+| Database: | describe_formatted | NULL |
+| Owner: | cloudera | NULL |
+| CreateTime: | Mon Jul 22 17:03:16 EDT 2013 | NULL |
+| LastAccessTime: | UNKNOWN | NULL |
+| Protect Mode: | None | NULL |
+| Retention: | 0 | NULL |
+| Location: | hdfs://127.0.0.1:8020/user/hive/warehouse/describe_formatted.db/t1 | NULL |
+| Table Type: | MANAGED_TABLE | NULL |
+| Table Parameters: | NULL | NULL |
+| | transient_lastDdlTime | 1374526996 |
+| | NULL | NULL |
+| # Storage Information | NULL | NULL |
+| SerDe Library: | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | NULL |
+| InputFormat: | org.apache.hadoop.mapred.TextInputFormat | NULL |
+| OutputFormat: | org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat | NULL |
+| Compressed: | No | NULL |
+| Num Buckets: | 0 | NULL |
+| Bucket Columns: | [] | NULL |
+| Sort Columns: | [] | NULL |
++------------------------------+--------------------------------------------------------------------+----------------------+
+Returned 26 row(s) in 0.03s
+[localhost:21000] > create view v1 as select x, upper(s) from t1;
+Query: create view v1 as select x, upper(s) from t1
+[localhost:21000] > describe v1;
+Query: describe v1
+Query finished, fetching results ...
++------+--------+---------+
+| name | type | comment |
++------+--------+---------+
+| x | int | |
+| _c1 | string | |
++------+--------+---------+
+Returned 2 row(s) in 0.10s
+[localhost:21000] > describe formatted v1;
+Query: describe formatted v1
+Query finished, fetching results ...
++------------------------------+------------------------------+----------------------+
+| name | type | comment |
++------------------------------+------------------------------+----------------------+
+| # col_name | data_type | comment |
+| | NULL | NULL |
+| x | int | None |
+| _c1 | string | None |
+| | NULL | NULL |
+| # Detailed Table Information | NULL | NULL |
+| Database: | describe_formatted | NULL |
+| Owner: | cloudera | NULL |
+| CreateTime: | Mon Jul 22 16:56:38 EDT 2013 | NULL |
+| LastAccessTime: | UNKNOWN | NULL |
+| Protect Mode: | None | NULL |
+| Retention: | 0 | NULL |
+| Table Type: | VIRTUAL_VIEW | NULL |
+| Table Parameters: | NULL | NULL |
+| | transient_lastDdlTime | 1374526598 |
+| | NULL | NULL |
+| # Storage Information | NULL | NULL |
+| SerDe Library: | null | NULL |
+| InputFormat: | null | NULL |
+| OutputFormat: | null | NULL |
+| Compressed: | No | NULL |
+| Num Buckets: | 0 | NULL |
+| Bucket Columns: | [] | NULL |
+| Sort Columns: | [] | NULL |
+| | NULL | NULL |
+| # View Information | NULL | NULL |
+| View Original Text: | SELECT x, upper(s) FROM t1 | NULL |
+| View Expanded Text: | SELECT x, upper(s) FROM t1 | NULL |
++------------------------------+------------------------------+----------------------+
+Returned 28 row(s) in 0.03s
+[localhost:21000] > create external table t2 (x int, y int, s string) stored as parquet location '/user/cloudera/sample_data';
+[localhost:21000] > describe formatted t2;
+Query: describe formatted t2
+Query finished, fetching results ...
++------------------------------+----------------------------------------------------+----------------------+
+| name | type | comment |
++------------------------------+----------------------------------------------------+----------------------+
+| # col_name | data_type | comment |
+| | NULL | NULL |
+| x | int | None |
+| y | int | None |
+| s | string | None |
+| | NULL | NULL |
+| # Detailed Table Information | NULL | NULL |
+| Database: | describe_formatted | NULL |
+| Owner: | cloudera | NULL |
+| CreateTime: | Mon Jul 22 17:01:47 EDT 2013 | NULL |
+| LastAccessTime: | UNKNOWN | NULL |
+| Protect Mode: | None | NULL |
+| Retention: | 0 | NULL |
+| Location: | hdfs://127.0.0.1:8020/user/cloudera/sample_data | NULL |
+| Table Type: | EXTERNAL_TABLE | NULL |
+| Table Parameters: | NULL | NULL |
+| | EXTERNAL | TRUE |
+| | transient_lastDdlTime | 1374526907 |
+| | NULL | NULL |
+| # Storage Information | NULL | NULL |
+| SerDe Library: | org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe | NULL |
+| InputFormat: | com.cloudera.impala.hive.serde.ParquetInputFormat | NULL |
+| OutputFormat: | com.cloudera.impala.hive.serde.ParquetOutputFormat | NULL |
+| Compressed: | No | NULL |
+| Num Buckets: | 0 | NULL |
+| Bucket Columns: | [] | NULL |
+| Sort Columns: | [] | NULL |
++------------------------------+----------------------------------------------------+----------------------+
+Returned 27 row(s) in 0.17s</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have read and execute
+ permissions for all directories that are part of the table.
+ (A table could span multiple different HDFS directories if it is partitioned.
+ The directories could be widely scattered because a partition can reside
+ in an arbitrary HDFS directory based on its <codeph>LOCATION</codeph> attribute.)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_tables.xml#tables"/>, <xref href="impala_create_table.xml#create_table"/>,
+ <xref href="impala_show.xml#show_tables"/>, <xref href="impala_show.xml#show_create_table"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_disable_codegen.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_disable_codegen.xml b/docs/topics/impala_disable_codegen.xml
new file mode 100644
index 0000000..844d49d
--- /dev/null
+++ b/docs/topics/impala_disable_codegen.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="disable_codegen">
+
+ <title>DISABLE_CODEGEN Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Troubleshooting"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">DISABLE_CODEGEN query option</indexterm>
+ This is a debug option, intended for diagnosing and working around issues that cause crashes. If a query
+ fails with an <q>illegal instruction</q> or other hardware-specific message, try setting
+ <codeph>DISABLE_CODEGEN=true</codeph> and running the query again. If the query succeeds only when the
+ <codeph>DISABLE_CODEGEN</codeph> option is turned on, submit the problem to Cloudera support and include that
+ detail in the problem report. Do not otherwise run with this setting turned on, because it results in lower
+ overall performance.
+ </p>
+
+ <p>
+ Because the code generation phase adds a small amount of overhead for each query, you might turn on the
+ <codeph>DISABLE_CODEGEN</codeph> option to achieve maximum throughput when running many short-lived queries
+ against small tables.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/default_false_0"/>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_disable_unsafe_spills.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_disable_unsafe_spills.xml b/docs/topics/impala_disable_unsafe_spills.xml
new file mode 100644
index 0000000..f251d65
--- /dev/null
+++ b/docs/topics/impala_disable_unsafe_spills.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.0.0" id="disable_unsafe_spills">
+
+ <title>DISABLE_UNSAFE_SPILLS Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Scalability"/>
+ <data name="Category" value="Memory"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">DISABLE_UNSAFE_SPILLS query option</indexterm>
+ Enable this option if you prefer to have queries fail when they exceed the Impala memory limit, rather than
+ write temporary data to disk.
+ </p>
+
+ <p>
+ Queries that <q>spill</q> to disk typically complete successfully, whereas in earlier Impala releases they would have failed.
+ However, queries with exorbitant memory requirements due to missing statistics or inefficient join clauses could
+ become so slow as a result that you would rather have them cancelled automatically and reduce the memory
+ usage through standard Impala tuning techniques.
+ </p>
+
+ <p>
+ This option prevents only <q>unsafe</q> spill operations, meaning that one or more tables are missing
+ statistics or the query does not include a hint to set the most efficient mechanism for a join or
+ <codeph>INSERT ... SELECT</codeph> into a partitioned table. These are the tables most likely to result in
+ suboptimal execution plans that could cause unnecessary spilling. Therefore, leaving this option enabled is a
+ good way to find tables on which to run the <codeph>COMPUTE STATS</codeph> statement.
+ </p>
+
+ <p>
+ See <xref href="impala_scalability.xml#spill_to_disk"/> for information about the <q>spill to disk</q>
+ feature for queries processing large result sets with joins, <codeph>ORDER BY</codeph>, <codeph>GROUP
+ BY</codeph>, <codeph>DISTINCT</codeph>, aggregation functions, or analytic functions.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/default_false_0"/>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_distinct.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_distinct.xml b/docs/topics/impala_distinct.xml
new file mode 100644
index 0000000..d49e400
--- /dev/null
+++ b/docs/topics/impala_distinct.xml
@@ -0,0 +1,59 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="distinct">
+
+ <title>DISTINCT Operator</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Aggregate Functions"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">DISTINCT operator</indexterm>
+ The <codeph>DISTINCT</codeph> operator in a <codeph>SELECT</codeph> statement filters the result set to
+ remove duplicates:
+ </p>
+
+<codeblock>-- Returns the unique values from one column.
+-- NULL is included in the set of values if any rows have a NULL in this column.
+select distinct c_birth_country from customer;
+-- Returns the unique combinations of values from multiple columns.
+select distinct c_salutation, c_last_name from customer;</codeblock>
+
+ <p>
+ You can use <codeph>DISTINCT</codeph> in combination with an aggregation function, typically
+ <codeph>COUNT()</codeph>, to find how many different values a column contains:
+ </p>
+
+<codeblock>-- Counts the unique values from one column.
+-- NULL is not included as a distinct value in the count.
+select count(distinct c_birth_country) from customer;
+-- Counts the unique combinations of values from multiple columns.
+select count(distinct c_salutation, c_last_name) from customer;</codeblock>
+
+ <p>
+ One construct that Impala SQL does <i>not</i> support is using <codeph>DISTINCT</codeph> in more than one
+ aggregation function in the same query. For example, you could not have a single query with both
+ <codeph>COUNT(DISTINCT c_first_name)</codeph> and <codeph>COUNT(DISTINCT c_last_name)</codeph> in the
+ <codeph>SELECT</codeph> list.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/zero_length_strings"/>
+
+ <note conref="../shared/impala_common.xml#common/multiple_count_distinct"/>
+
+ <note>
+ <p>
+ In contrast with some database systems that always return <codeph>DISTINCT</codeph> values in sorted order,
+ Impala does not do any ordering of <codeph>DISTINCT</codeph> values. Always include an <codeph>ORDER
+ BY</codeph> clause if you need the values in alphabetical or numeric sorted order.
+ </p>
+ </note>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_dml.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_dml.xml b/docs/topics/impala_dml.xml
new file mode 100644
index 0000000..66d4022
--- /dev/null
+++ b/docs/topics/impala_dml.xml
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="dml">
+
+ <title>DML Statements</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="DML"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="ETL"/>
+ <data name="Category" value="Ingest"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ DML refers to <q>Data Manipulation Language</q>, a subset of SQL statements that modify the data stored in
+ tables. Because Impala focuses on query performance and leverages the append-only nature of HDFS storage,
+ currently Impala only supports a small set of DML statements:
+ </p>
+
+ <ul>
+ <li audience="impala_next">
+ <xref href="impala_delete.xml#delete"/>; works for Kudu tables only
+ </li>
+
+ <li>
+ <xref href="impala_insert.xml#insert"/>
+ </li>
+
+ <li>
+ <xref href="impala_load_data.xml#load_data"/>
+ </li>
+
+ <li audience="impala_next">
+ <xref href="impala_update.xml#update"/>; works for Kudu tables only
+ </li>
+ </ul>
+
+ <p>
+ <codeph>INSERT</codeph> in Impala is primarily optimized for inserting large volumes of data in a single
+ statement, to make effective use of the multi-megabyte HDFS blocks. This is the way in Impala to create new
+ data files. If you intend to insert one or a few rows at a time, such as using the <codeph>INSERT ...
+ VALUES</codeph> syntax, that technique is much more efficient for Impala tables stored in HBase. See
+ <xref href="impala_hbase.xml#impala_hbase"/> for details.
+ </p>
+
+ <p>
+ <codeph>LOAD DATA</codeph> moves existing data files into the directory for an Impala table, making them
+ immediately available for Impala queries. This is one way in Impala to work with data files produced by other
+ Hadoop components. (<codeph>CREATE EXTERNAL TABLE</codeph> is the other alternative; with external tables,
+ you can query existing data files, while the files remain in their original location.)
+ </p>
+
+ <p>
+ To simulate the effects of an <codeph>UPDATE</codeph> or <codeph>DELETE</codeph> statement in other database
+ systems, typically you use <codeph>INSERT</codeph> or <codeph>CREATE TABLE AS SELECT</codeph> to copy data
+ from one table to another, filtering out or changing the appropriate rows during the copy operation.
+ </p>
+
+ <p>
+ Although Impala currently does not have an <codeph>UPDATE</codeph> statement, you can achieve a similar
+ result by using Impala tables stored in HBase. When you insert a row into an HBase table, and the table
+ already contains a row with the same value for the key column, the older row is hidden, effectively the same
+ as a single-row <codeph>UPDATE</codeph>.
+ </p>
+
+ <p rev="2.2.0">
+ Currently, Impala cannot perform DML operations for tables or partitions stored in the Amazon S3 filesystem.
+ See <xref href="impala_s3.xml#s3"/> for details.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ The other major classifications of SQL statements are data definition language (see
+ <xref href="impala_ddl.xml#ddl"/>) and queries (see <xref href="impala_select.xml#select"/>).
+ </p>
+ </conbody>
+</concept>
[15/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_create_function.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_create_function.xml b/docs/topics/impala_create_function.xml
new file mode 100644
index 0000000..4140289
--- /dev/null
+++ b/docs/topics/impala_create_function.xml
@@ -0,0 +1,291 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.2" id="create_function">
+
+ <title>CREATE FUNCTION Statement</title>
+ <titlealts><navtitle>CREATE FUNCTION</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="Schemas"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="UDFs"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">CREATE FUNCTION statement</indexterm>
+ Creates a user-defined function (UDF), which you can use to implement custom logic during
+ <codeph>SELECT</codeph> or <codeph>INSERT</codeph> operations.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ The syntax is different depending on whether you create a scalar UDF, which is called once for each row and
+ implemented by a single function, or a user-defined aggregate function (UDA), which is implemented by
+ multiple functions that compute intermediate results across sets of rows.
+ </p>
+
+ <p>
+ To create a scalar UDF, issue a <codeph>CREATE FUNCTION</codeph> statement:
+ </p>
+
+<codeblock>CREATE FUNCTION [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>function_name</varname>([<varname>arg_type</varname>[, <varname>arg_type</varname>...]])
+ RETURNS <varname>return_type</varname>
+ LOCATION '<varname>hdfs_path</varname>'
+ SYMBOL='<varname>symbol_or_class</varname>'</codeblock>
+
+ <p>
+ To create a UDA, issue a <codeph>CREATE AGGREGATE FUNCTION</codeph> statement:
+ </p>
+
+<codeblock>CREATE [AGGREGATE] FUNCTION [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>function_name</varname>([<varname>arg_type</varname>[, <varname>arg_type</varname>...]])
+ RETURNS <varname>return_type</varname>
+ LOCATION '<varname>hdfs_path</varname>'
+ [INIT_FN='<varname>function</varname>']
+ UPDATE_FN='<varname>function</varname>'
+ MERGE_FN='<varname>function</varname>'
+ [PREPARE_FN='<varname>function</varname>']
+ [CLOSEFN='<varname>function</varname>']
+ <ph rev="2.0.0">[SERIALIZE_FN='<varname>function</varname>']</ph>
+ [FINALIZE_FN='<varname>function</varname>']
+<!-- [INTERMEDIATE <varname>type_spec</varname>] --></codeblock>
+
+ <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
+
+ <p>
+ <b>Varargs notation:</b>
+ </p>
+
+ <p>
+ If the underlying implementation of your function accepts a variable number of arguments:
+ </p>
+
+ <ul>
+ <li>
+ The variable arguments must go last in the argument list.
+ </li>
+
+ <li>
+ The variable arguments must all be of the same type.
+ </li>
+
+ <li>
+ You must include at least one instance of the variable arguments in every function call invoked from SQL.
+ </li>
+
+ <li>
+ You designate the variable portion of the argument list in the <codeph>CREATE FUNCTION</codeph> statement
+ by including <codeph>...</codeph> immediately after the type name of the first variable argument. For
+ example, to create a function that accepts an <codeph>INT</codeph> argument, followed by a
+ <codeph>BOOLEAN</codeph>, followed by one or more <codeph>STRING</codeph> arguments, your <codeph>CREATE
+ FUNCTION</codeph> statement would look like:
+<codeblock>CREATE FUNCTION <varname>func_name</varname> (INT, BOOLEAN, STRING ...)
+ RETURNS <varname>type</varname> LOCATION '<varname>path</varname>' SYMBOL='<varname>entry_point</varname>';
+</codeblock>
+ </li>
+ </ul>
+
+ <p>
+ See <xref href="impala_udf.xml#udf_varargs"/> for how to code the C++ or Java function to accept
+ variable-length argument lists.
+ </p>
+
+ <p>
+ <b>Scalar and aggregate functions:</b>
+ </p>
+
+ <p>
+ The simplest kind of user-defined function returns a single scalar value each time it is called, typically
+ once for each row in the result set. This general kind of function is what is usually meant by UDF.
+ User-defined aggregate functions (UDAs) are a specialized kind of UDF that produce a single value based on
+ the contents of multiple rows. You usually use UDAs in combination with a <codeph>GROUP BY</codeph> clause to
+ condense a large result set into a smaller one, or even a single row summarizing column values across an
+ entire table.
+ </p>
+
+ <p>
+ You create UDAs by using the <codeph>CREATE AGGREGATE FUNCTION</codeph> syntax. The clauses
+ <codeph>INIT_FN</codeph>, <codeph>UPDATE_FN</codeph>, <codeph>MERGE_FN</codeph>,
+ <ph rev="2.0.0"><codeph>SERIALIZE_FN</codeph>,</ph> <codeph>FINALIZE_FN</codeph>, and
+ <codeph>INTERMEDIATE</codeph> only apply when you create a UDA rather than a scalar UDF.
+ </p>
+
+ <p>
+ The <codeph>*_FN</codeph> clauses specify functions to call at different phases of function processing.
+ </p>
+
+ <ul>
+ <li>
+ <b>Initialize:</b> The function you specify with the <codeph>INIT_FN</codeph> clause does any initial
+ setup, such as initializing member variables in internal data structures. This function is often a stub for
+ simple UDAs. You can omit this clause and a default (no-op) function will be used.
+ </li>
+
+ <li>
+ <b>Update:</b> The function you specify with the <codeph>UPDATE_FN</codeph> clause is called once for each
+ row in the original result set, that is, before any <codeph>GROUP BY</codeph> clause is applied. A separate
+ instance of the function is called for each different value returned by the <codeph>GROUP BY</codeph>
+ clause. The final argument passed to this function is a pointer, to which you write an updated value based
+ on its original value and the value of the first argument.
+ </li>
+
+ <li>
+ <b>Merge:</b> The function you specify with the <codeph>MERGE_FN</codeph> clause is called an arbitrary
+ number of times, to combine intermediate values produced by different nodes or different threads as Impala
+ reads and processes data files in parallel. The final argument passed to this function is a pointer, to
+ which you write an updated value based on its original value and the value of the first argument.
+ </li>
+
+ <li rev="2.0.0">
+ <b>Serialize:</b> The function you specify with the <codeph>SERIALIZE_FN</codeph> clause frees memory
+ allocated to intermediate results. It is required if any memory was allocated by the Allocate function in
+ the Init, Update, or Merge functions, or if the intermediate type contains any pointers. See
+ <xref href="https://github.com/cloudera/impala-udf-samples/blob/master/uda-sample.cc" scope="external" format="html">the
+ UDA code samples</xref> for details.
+ </li>
+
+ <li>
+ <b>Finalize:</b> The function you specify with the <codeph>FINALIZE_FN</codeph> clause does any required
+ teardown for resources acquired by your UDF, such as freeing memory, closing file handles if you explicitly
+ opened any files, and so on. This function is often a stub for simple UDAs. You can omit this clause and a
+ default (no-op) function will be used. It is required in UDAs where the final return type is different from
+ the intermediate type, or if any memory was allocated by the Allocate function in the Init, Update, or
+ Merge functions. See
+ <xref href="https://github.com/cloudera/impala-udf-samples/blob/master/uda-sample.cc" scope="external" format="html">the
+ UDA code samples</xref> for details.
+ </li>
+ </ul>
+
+ <p>
+ If you use a consistent naming convention for each of the underlying functions, Impala can automatically
+ determine the names based on the first such clause, so the others are optional.
+ </p>
+
+ <p audience="Cloudera">
+ The <codeph>INTERMEDIATE</codeph> clause specifies the data type of intermediate values passed from the
+ <q>update</q> phase to the <q>merge</q> phase, and from the <q>merge</q> phase to the <q>finalize</q> phase.
+ You can use any of the existing Impala data types, or the special notation
+ <codeph>CHAR(<varname>n</varname>)</codeph> to allocate a scratch area of <varname>n</varname> bytes for the
+ intermediate result. For example, if the different phases of your UDA pass strings to each other but in the
+ end the function returns a <codeph>BIGINT</codeph> value, you would specify <codeph>INTERMEDIATE
+ STRING</codeph>. Likewise, if the different phases of your UDA pass 2 separate <codeph>BIGINT</codeph> values
+ between them (8 bytes each), you would specify <codeph>INTERMEDIATE CHAR(16)</codeph> so that each function
+ could read from and write to a 16-byte buffer.
+ </p>
+
+ <p>
+ For end-to-end examples of UDAs, see <xref href="impala_udf.xml#udfs"/>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/udfs_no_complex_types"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <ul>
+ <li>
+ You can write Impala UDFs in either C++ or Java. C++ UDFs are new to Impala, and are the recommended format
+ for high performance utilizing native code. Java-based UDFs are compatible between Impala and Hive, and are
+ most suited to reusing existing Hive UDFs. (Impala can run Java-based Hive UDFs but not Hive UDAs.)
+ </li>
+
+ <li>
+ The body of the UDF is represented by a <codeph>.so</codeph> or <codeph>.jar</codeph> file, which you store
+ in HDFS and the <codeph>CREATE FUNCTION</codeph> statement distributes to each Impala node.
+ </li>
+
+ <li>
+ Impala calls the underlying code during SQL statement evaluation, as many times as needed to process all
+ the rows from the result set. All UDFs are assumed to be deterministic, that is, to always return the same
+ result when passed the same argument values. Impala might or might not skip some invocations of a UDF if
+ the result value is already known from a previous call. Therefore, do not rely on the UDF being called a
+ specific number of times, and do not return different result values based on some external factor such as
+ the current time, a random number function, or an external data source that could be updated while an
+ Impala query is in progress.
+ </li>
+
+ <li>
+ The names of the function arguments in the UDF are not significant, only their number, positions, and data
+ types.
+ </li>
+
+ <li>
+ You can overload the same function name by creating multiple versions of the function, each with a
+ different argument signature. For security reasons, you cannot make a UDF with the same name as any
+ built-in function.
+ </li>
+
+ <li>
+ In the UDF code, you represent the function return result as a <codeph>struct</codeph>. This
+ <codeph>struct</codeph> contains 2 fields. The first field is a <codeph>boolean</codeph> representing
+ whether the value is <codeph>NULL</codeph> or not. (When this field is <codeph>true</codeph>, the return
+ value is interpreted as <codeph>NULL</codeph>.) The second field is the same type as the specified function
+ return type, and holds the return value when the function returns something other than
+ <codeph>NULL</codeph>.
+ </li>
+
+ <li>
+ In the UDF code, you represent the function arguments as an initial pointer to a UDF context structure,
+ followed by references to zero or more <codeph>struct</codeph>s, corresponding to each of the arguments.
+ Each <codeph>struct</codeph> has the same 2 fields as with the return value, a <codeph>boolean</codeph>
+ field representing whether the argument is <codeph>NULL</codeph>, and a field of the appropriate type
+ holding any non-<codeph>NULL</codeph> argument value.
+ </li>
+
+ <li>
+ For sample code and build instructions for UDFs,
+ see <xref href="https://github.com/cloudera/impala/tree/master/be/src/udf_samples" scope="external" format="html">the sample UDFs in the Impala github repo</xref>.
+ </li>
+
+ <li>
+ Because the file representing the body of the UDF is stored in HDFS, it is automatically available to all
+ the Impala nodes. You do not need to manually copy any UDF-related files between servers.
+ </li>
+
+ <li>
+ Because Impala currently does not have any <codeph>ALTER FUNCTION</codeph> statement, if you need to rename
+ a function, move it to a different database, or change its signature or other properties, issue a
+ <codeph>DROP FUNCTION</codeph> statement for the original function followed by a <codeph>CREATE
+ FUNCTION</codeph> with the desired properties.
+ </li>
+
+ <li>
+ Because each UDF is associated with a particular database, either issue a <codeph>USE</codeph> statement
+ before doing any <codeph>CREATE FUNCTION</codeph> statements, or specify the name of the function as
+ <codeph><varname>db_name</varname>.<varname>function_name</varname></codeph>.
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/compatibility_blurb"/>
+
+ <p>
+ Impala can run UDFs that were created through Hive, as long as they refer to Impala-compatible data types
+ (not composite or nested column types). Hive can run Java-based UDFs that were created through Impala, but
+ not Impala UDFs written in C++.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/udf_persistence_restriction"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_udf.xml#udfs"/> for more background information, usage instructions, and examples for
+ Impala UDFs; <xref href="impala_drop_function.xml#drop_function"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_create_role.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_create_role.xml b/docs/topics/impala_create_role.xml
new file mode 100644
index 0000000..975ce15
--- /dev/null
+++ b/docs/topics/impala_create_role.xml
@@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.4.0" id="create_role">
+
+ <title>CREATE ROLE Statement (CDH 5.2 or higher only)</title>
+ <titlealts><navtitle>CREATE ROLE (CDH 5.2 or higher only)</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Sentry"/>
+ <data name="Category" value="Roles"/>
+ <!-- Consider whether to go deeper into categories like Security for the Sentry-related statements. -->
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">CREATE ROLE statement</indexterm>
+<!-- Copied from Sentry docs. Turn into conref. -->
+ The <codeph>CREATE ROLE</codeph> statement creates a role to which privileges can be granted. Privileges can
+ be granted to roles, which can then be assigned to users. A user that has been assigned a role will only be
+ able to exercise the privileges of that role. Only users that have administrative privileges can create/drop
+ roles. By default, the <codeph>hive</codeph>, <codeph>impala</codeph> and <codeph>hue</codeph> users have
+ administrative privileges in Sentry.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>CREATE ROLE <varname>role_name</varname>
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/privileges_blurb"/>
+
+ <p>
+ Only administrative users (those with <codeph>ALL</codeph> privileges on the server, defined in the Sentry
+ policy file) can use this statement.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/compatibility_blurb"/>
+
+ <p>
+ Impala makes use of any roles and privileges specified by the <codeph>GRANT</codeph> and
+ <codeph>REVOKE</codeph> statements in Hive, and Hive makes use of any roles and privileges specified by the
+ <codeph>GRANT</codeph> and <codeph>REVOKE</codeph> statements in Impala. The Impala <codeph>GRANT</codeph>
+ and <codeph>REVOKE</codeph> statements for privileges do not require the <codeph>ROLE</codeph> keyword to be
+ repeated before each role name, unlike the equivalent Hive statements.
+ </p>
+
+<!-- To do: nail down the new SHOW syntax, e.g. SHOW ROLES, SHOW CURRENT ROLES, SHOW GROUPS. -->
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_authorization.xml#authorization"/>, <xref href="impala_grant.xml#grant"/>,
+ <xref href="impala_revoke.xml#revoke"/>, <xref href="impala_drop_role.xml#drop_role"/>,
+ <xref href="impala_show.xml#show"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_create_table.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_create_table.xml b/docs/topics/impala_create_table.xml
new file mode 100644
index 0000000..cdaee4a
--- /dev/null
+++ b/docs/topics/impala_create_table.xml
@@ -0,0 +1,650 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="create_table">
+
+ <title>CREATE TABLE Statement</title>
+ <titlealts><navtitle>CREATE TABLE</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="HDFS Caching"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Schemas"/>
+ <data audience="impala_next" name="Category" value="Kudu"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">CREATE TABLE statement</indexterm>
+ Creates a new table and specifies its characteristics. While creating a table, you optionally specify aspects
+ such as:
+ </p>
+
+ <ul>
+ <li>
+ Whether the table is internal or external.
+ </li>
+
+ <li>
+ The columns and associated data types.
+ </li>
+
+ <li>
+ The columns used for physically partitioning the data.
+ </li>
+
+ <li>
+ The file format for data files.
+ </li>
+
+ <li>
+ The HDFS directory where the data files are located.
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ The general syntax for creating a table and specifying its columns is as follows:
+ </p>
+
+ <p>
+ <b>Explicit column definitions:</b>
+ </p>
+
+<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
+ [(<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)]
+ [COMMENT '<varname>table_comment</varname>']
+ [PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)]
+ [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
+ [
+ [ROW FORMAT <varname>row_format</varname>] [STORED AS <varname>file_format</varname>]
+ ]
+ [LOCATION '<varname>hdfs_path</varname>']
+ [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
+<ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
+</codeblock>
+
+ <p>
+ <b>Column definitions inferred from data file:</b>
+ </p>
+
+<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
+ LIKE PARQUET '<varname>hdfs_path_of_parquet_file</varname>'
+ [COMMENT '<varname>table_comment</varname>']
+ [PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)]
+ [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
+ [
+ [ROW FORMAT <varname>row_format</varname>] [STORED AS <varname>file_format</varname>]
+ ]
+ [LOCATION '<varname>hdfs_path</varname>']
+ [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
+<ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
+data_type:
+ <varname>primitive_type</varname>
+ | array_type
+ | map_type
+ | struct_type
+</codeblock>
+
+ <p>
+ <b>CREATE TABLE AS SELECT:</b>
+ </p>
+
+<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
+ [COMMENT '<varname>table_comment</varname>']
+ [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
+ [
+ [ROW FORMAT <varname>row_format</varname>] [STORED AS <varname>file_format</varname>]
+ ]
+ [LOCATION '<varname>hdfs_path</varname>']
+ [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
+<ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
+AS
+ <varname>select_statement</varname></codeblock>
+
+<codeblock>primitive_type:
+ TINYINT
+ | SMALLINT
+ | INT
+ | BIGINT
+ | BOOLEAN
+ | FLOAT
+ | DOUBLE
+ <ph rev="1.4.0">| DECIMAL</ph>
+ | STRING
+ <ph rev="2.0.0">| CHAR</ph>
+ <ph rev="2.0.0">| VARCHAR</ph>
+ | TIMESTAMP
+
+<ph rev="2.3.0">complex_type:
+ struct_type
+ | array_type
+ | map_type
+
+struct_type: STRUCT &lt; <varname>name</varname> : <varname>primitive_or_complex_type</varname> [COMMENT '<varname>comment_string</varname>'], ... &gt;
+
+array_type: ARRAY &lt; <varname>primitive_or_complex_type</varname> &gt;
+
+map_type: MAP &lt; <varname>primitive_type</varname>, <varname>primitive_or_complex_type</varname> &gt;
+</ph>
+row_format:
+ DELIMITED [FIELDS TERMINATED BY '<varname>char</varname>' [ESCAPED BY '<varname>char</varname>']]
+ [LINES TERMINATED BY '<varname>char</varname>']
+
+file_format:
+ PARQUET
+ | TEXTFILE
+ | AVRO
+ | SEQUENCEFILE
+ | RCFILE
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
+
+ <!-- Should really have some info up front about all the data types and file formats.
+ Consider adding here, or at least making inline links to the relevant keywords
+ in the syntax spec above. -->
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p rev="2.3.0">
+ The Impala complex types (<codeph>STRUCT</codeph>, <codeph>ARRAY</codeph>, or <codeph>MAP</codeph>)
+ are available in CDH 5.5 / Impala 2.3 and higher.
+ Because you can nest these types (for example, to make an array of maps or a struct
+ with an array field), these types are also sometimes referred to as nested types.
+ See <xref href="impala_complex_types.xml#complex_types"/> for usage details.
+ </p>
+
+ <!-- This is kind of an obscure and rare usage scenario. Consider moving all the complex type stuff further down
+ after some of the more common clauses. -->
+ <p rev="2.3.0">
+ Impala can create tables containing complex type columns, with any supported file format.
+ Because currently Impala can only query complex type columns in Parquet tables, creating
+ tables with complex type columns and other file formats such as text is of limited use.
+ For example, you might create a text table including some columns with complex types with Impala, and use Hive
+ as part of your ETL pipeline to ingest the nested type data and copy it to an identical Parquet table.
+ Or you might create a partitioned table containing complex type columns using one file format, and
+ use <codeph>ALTER TABLE</codeph> to change the file format of individual partitions to Parquet; Impala
+ can then query only the Parquet-format partitions in that table.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_partitioning"/>
+
+ <p>
+ <b>Internal and external tables (EXTERNAL and LOCATION clauses):</b>
+ </p>
+
+ <p>
+ By default, Impala creates an <q>internal</q> table, where Impala manages the underlying data files for the
+ table, and physically deletes the data files when you drop the table. If you specify the
+ <codeph>EXTERNAL</codeph> clause, Impala treats the table as an <q>external</q> table, where the data files
+ are typically produced outside Impala and queried from their original locations in HDFS, and Impala leaves
+ the data files in place when you drop the table. For details about internal and external tables, see
+ <xref href="impala_tables.xml#tables"/>.
+ </p>
+
+ <p>
+ Typically, for an external table you include a <codeph>LOCATION</codeph> clause to specify the path to the
+ HDFS directory where Impala reads and writes files for the table. For example, if your data pipeline produces
+ Parquet files in the HDFS directory <filepath>/user/etl/destination</filepath>, you might create an external
+ table as follows:
+ </p>
+
+<codeblock>CREATE EXTERNAL TABLE external_parquet (c1 INT, c2 STRING, c3 TIMESTAMP)
+ STORED AS PARQUET LOCATION '/user/etl/destination';
+</codeblock>
+
+ <p>
+ Although the <codeph>EXTERNAL</codeph> and <codeph>LOCATION</codeph> clauses are often specified together,
+ <codeph>LOCATION</codeph> is optional for external tables, and you can also specify <codeph>LOCATION</codeph>
+ for internal tables. The difference is all about whether Impala <q>takes control</q> of the underlying data
+ files and moves them when you rename the table, or deletes them when you drop the table. For more about
+ internal and external tables and how they interact with the <codeph>LOCATION</codeph> attribute, see
+ <xref href="impala_tables.xml#tables"/>.
+ </p>
+
+ <p>
+ <b>Partitioned tables (PARTITIONED BY clause):</b>
+ </p>
+
+ <p>
+ The <codeph>PARTITIONED BY</codeph> clause divides the data files based on the values from one or more
+ specified columns. Impala queries can use the partition metadata to minimize the amount of data that is read
+ from disk or transmitted across the network, particularly during join queries. For details about
+ partitioning, see <xref href="impala_partitioning.xml#partitioning"/>.
+ </p>
+
+ <p rev="kudu" audience="impala_next">
+ <b>Partitioning for Kudu tables (DISTRIBUTE BY clause):</b>
+ </p>
+
+ <p rev="kudu" audience="impala_next">
+ For Kudu tables, you specify logical partitioning across one or more columns using the
+ <codeph>DISTRIBUTE BY</codeph> clause. In contrast to partitioning for HDFS-based tables,
+ multiple values for a partition key column can be located in the same partition.
+ The optional <codeph>HASH</codeph> clause lets you divide one or a set of partition key columns
+ into a specified number of buckets; you can use more than one <codeph>HASH</codeph>
+ clause, specifying a distinct set of partition key columns for each.
+ The optional <codeph>RANGE</codeph> clause further subdivides the partitions, based on
+ a set of literal values for the partition key columns.
+ </p>
+
+ <p>
+ <b>Specifying file format (STORED AS and ROW FORMAT clauses):</b>
+ </p>
+
+ <p>
+ The <codeph>STORED AS</codeph> clause identifies the format of the underlying data files. Currently, Impala
+ can query more types of file formats than it can create or insert into. Use Hive to perform any create or
+ data load operations that are not currently available in Impala. For example, Impala can create a
+ SequenceFile table but cannot insert data into it. There are also Impala-specific procedures for using
+ compression with each kind of file format. For details about working with data files of various formats, see
+ <xref href="impala_file_formats.xml#file_formats"/>.
+ </p>
+
+ <note>
+ In Impala 1.4.0 and higher, Impala can create Avro tables, which formerly required doing the <codeph>CREATE
+ TABLE</codeph> statement in Hive. See <xref href="impala_avro.xml#avro"/> for details and examples.
+ </note>
+
+ <p>
+ By default (when no <codeph>STORED AS</codeph> clause is specified), data files in Impala tables are created
+ as text files with Ctrl-A (hex 01) characters as the delimiter.
+<!-- Verify if ROW FORMAT is entirely ignored outside of text tables, or does it apply somehow to SequenceFile and/or RCFile too? -->
+ Specify the <codeph>ROW FORMAT DELIMITED</codeph> clause to produce or ingest data files that use a different
+ delimiter character such as tab or <codeph>|</codeph>, or a different line end character such as carriage
+ return or newline. When specifying delimiter and line end characters with the <codeph>FIELDS TERMINATED
+ BY</codeph> and <codeph>LINES TERMINATED BY</codeph> clauses, use <codeph>'\t'</codeph> for tab,
+ <codeph>'\n'</codeph> for newline or linefeed, <codeph>'\r'</codeph> for carriage return, and
+ <codeph>\</codeph><codeph>0</codeph> for ASCII <codeph>nul</codeph> (hex 00). For more examples of text
+ tables, see <xref href="impala_txtfile.xml#txtfile"/>.
+ </p>
+
+ <p>
+ The <codeph>ESCAPED BY</codeph> clause applies both to text files that you create through an
+ <codeph>INSERT</codeph> statement to an Impala <codeph>TEXTFILE</codeph> table, and to existing data files
+ that you put into an Impala table directory. (You can ingest existing data files either by creating the table
+ with <codeph>CREATE EXTERNAL TABLE ... LOCATION</codeph>, the <codeph>LOAD DATA</codeph> statement, or
+ through an HDFS operation such as <codeph>hdfs dfs -put <varname>file</varname>
+ <varname>hdfs_path</varname></codeph>.) Choose an escape character that is not used anywhere else in the
+ file, and put it in front of each instance of the delimiter character that occurs within a field value.
+ Surrounding field values with quotation marks does not help Impala to parse fields with embedded delimiter
+ characters; the quotation marks are considered to be part of the column value. If you want to use
+ <codeph>\</codeph> as the escape character, specify the clause in <cmdname>impala-shell</cmdname> as
+ <codeph>ESCAPED BY '\\'</codeph>.
+ </p>
+
+ <note conref="../shared/impala_common.xml#common/thorn"/>
+
+ <p>
+ <b>Cloning tables (LIKE clause):</b>
+ </p>
+
+ <p>
+ To create an empty table with the same columns, comments, and other attributes as another table, use the
+ following variation. The <codeph>CREATE TABLE ... LIKE</codeph> form allows a restricted set of clauses,
+ currently only the <codeph>LOCATION</codeph>, <codeph>COMMENT</codeph>, and <codeph>STORED AS</codeph>
+ clauses.
+ </p>
+
+<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
+ <ph rev="1.4.0">LIKE { [<varname>db_name</varname>.]<varname>table_name</varname> | PARQUET '<varname>hdfs_path_of_parquet_file</varname>' }</ph>
+ [COMMENT '<varname>table_comment</varname>']
+ [STORED AS <varname>file_format</varname>]
+ [LOCATION '<varname>hdfs_path</varname>']</codeblock>
+
+ <note rev="1.2">
+ To clone the structure of a table and transfer data into it in a single operation, use the <codeph>CREATE
+ TABLE AS SELECT</codeph> syntax described in the next subsection.
+ </note>
+
+ <p>
+ When you clone the structure of an existing table using the <codeph>CREATE TABLE ... LIKE</codeph> syntax,
+ the new table keeps the same file format as the original one, so you only need to specify the <codeph>STORED
+ AS</codeph> clause if you want to use a different file format, or when specifying a view as the original
+ table. (Creating a table <q>like</q> a view produces a text table by default.)
+ </p>
+
+ <p>
+ Although normally Impala cannot create an HBase table directly, Impala can clone the structure of an existing
+ HBase table with the <codeph>CREATE TABLE ... LIKE</codeph> syntax, preserving the file format and metadata
+ from the original table.
+ </p>
+
+ <p>
+ There are some exceptions to the ability to use <codeph>CREATE TABLE ... LIKE</codeph> with an Avro table.
+ For example, you cannot use this technique for an Avro table that is specified with an Avro schema but no
+ columns. When in doubt, check if a <codeph>CREATE TABLE ... LIKE</codeph> operation works in Hive; if not, it
+ typically will not work in Impala either.
+ </p>
+
+ <p>
+ If the original table is partitioned, the new table inherits the same partition key columns. Because the new
+ table is initially empty, it does not inherit the actual partitions that exist in the original one. To create
+ partitions in the new table, insert data or issue <codeph>ALTER TABLE ... ADD PARTITION</codeph> statements.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/create_table_like_view"/>
+
+ <p>
+ Because <codeph>CREATE TABLE ... LIKE</codeph> only manipulates table metadata, not the physical data of the
+ table, issue <codeph>INSERT INTO TABLE</codeph> statements afterward to copy any data from the original table
+ into the new one, optionally converting the data to a new file format. (For some file formats, Impala can do
+ a <codeph>CREATE TABLE ... LIKE</codeph> to create the table, but Impala cannot insert data in that file
+ format; in these cases, you must load the data in Hive. See
+ <xref href="impala_file_formats.xml#file_formats"/> for details.)
+ </p>
+
+ <p rev="1.2" id="ctas">
+ <b>CREATE TABLE AS SELECT:</b>
+ </p>
+
+ <p>
+ The <codeph>CREATE TABLE AS SELECT</codeph> syntax is a shorthand notation to create a table based on column
+ definitions from another table, and copy data from the source table to the destination table without issuing
+ any separate <codeph>INSERT</codeph> statement. This idiom is so popular that it has its own acronym,
+ <q>CTAS</q>.
+<!--
+ The <codeph>CREATE TABLE AS SELECT</codeph> syntax is as follows:
+ -->
+ </p>
+
+<!-- CREATE TABLE AS <select> now incorporated up higher in the original syntax diagram,
+ thus commented out here.
+ Does CTAS only accept a limited subset of clauses? -->
+
+<!--
+<codeblock rev="1.2">CREATE [EXTERNAL] TABLE [IF NOT EXISTS] <varname>db_name</varname>.]<varname>table_name</varname>
+ [COMMENT '<varname>table_comment</varname>']
+ [STORED AS <varname>file_format</varname>]
+ [LOCATION '<varname>hdfs_path</varname>']
+AS
+ <varname>select_statement</varname></codeblock>
+-->
+
+ <p rev="1.2">
+ See <xref href="impala_select.xml#select"/> for details about query syntax for the <codeph>SELECT</codeph>
+ portion of a <codeph>CREATE TABLE AS SELECT</codeph> statement.
+ </p>
+
+ <p rev="1.2">
+ The newly created table inherits the column names that you select from the original table, which you can
+ override by specifying column aliases in the query. Any column or table comments from the original table are
+ not carried over to the new table.
+ </p>
+
+ <p rev="obwl" conref="../shared/impala_common.xml#common/insert_sort_blurb"/>
+
+ <p rev="1.2">
+ For example, the following statements show how you can clone all the data in a table, or a subset of the
+ columns and/or rows, or reorder columns, rename them, or construct them out of expressions:
+ </p>
+
+<codeblock rev="1.2">-- Create new table and copy all data.
+CREATE TABLE clone_of_t1 AS SELECT * FROM t1;
+-- Same idea as CREATE TABLE LIKE, don't copy any data.
+CREATE TABLE empty_clone_of_t1 AS SELECT * FROM t1 WHERE 1=0;
+-- Copy some data.
+CREATE TABLE subset_of_t1 AS SELECT * FROM t1 WHERE x > 100 AND y LIKE 'A%';
+CREATE TABLE summary_of_t1 AS SELECT c1, sum(c2) AS total, avg(c2) AS average FROM t1 GROUP BY c1;
+-- Switch file format.
+CREATE TABLE parquet_version_of_t1 STORED AS PARQUET AS SELECT * FROM t1;
+-- Create tables with different column order, names, or types than the original.
+CREATE TABLE some_columns_from_t1 AS SELECT c1, c3, c5 FROM t1;
+CREATE TABLE reordered_columns_from_t1 AS SELECT c4, c3, c1, c2 FROM t1;
+CREATE TABLE synthesized_columns AS SELECT upper(c1) AS all_caps, c2+c3 AS total, "California" AS state FROM t1;</codeblock>
+
+ <p rev="1.2">
+ As part of a CTAS operation, you can convert the data to any file format that Impala can write (currently,
+ <codeph>TEXTFILE</codeph> and <codeph>PARQUET</codeph>). You cannot specify the lower-level properties of a
+ text table, such as the delimiter. Although you can use a partitioned table as the source and copy data from
+ it, you cannot specify any partitioning clauses for the new table.
+ </p>
+
+ <p rev="1.4.0">
+ <b>CREATE TABLE LIKE PARQUET:</b>
+ </p>
+
+ <p rev="1.4.0">
+ The variation <codeph>CREATE TABLE ... LIKE PARQUET '<varname>hdfs_path_of_parquet_file</varname>'</codeph>
+ lets you skip the column definitions of the <codeph>CREATE TABLE</codeph> statement. The column names and
+ data types are automatically configured based on the organization of the specified Parquet data file, which
+ must already reside in HDFS. You can use a data file located outside the Impala database directories, or a
+ file from an existing Impala Parquet table; either way, Impala only uses the column definitions from the file
+ and does not use the HDFS location for the <codeph>LOCATION</codeph> attribute of the new table. (Although
+ you can also specify the enclosing directory with the <codeph>LOCATION</codeph> attribute, to both use the
+ same schema as the data file and point the Impala table at the associated directory for querying.)
+ </p>
+
+ <p rev="1.4.0">
+ The following considerations apply when you use the <codeph>CREATE TABLE LIKE PARQUET</codeph> technique:
+ </p>
+
+ <ul rev="1.4.0">
+ <li>
+ Any column comments from the original table are not preserved in the new table. Each column in the new
+ table has a comment stating the low-level Parquet field type used to deduce the appropriate SQL column
+ type.
+ </li>
+
+ <li>
+ If you use a data file from a partitioned Impala table, any partition key columns from the original table
+ are left out of the new table, because they are represented in HDFS directory names rather than stored in
+ the data file. To preserve the partition information, repeat the same <codeph>PARTITION</codeph> clause as
+ in the original <codeph>CREATE TABLE</codeph> statement.
+ </li>
+
+ <li>
+ The file format of the new table defaults to text, as with other kinds of <codeph>CREATE TABLE</codeph>
+ statements. To make the new table also use Parquet format, include the clause <codeph>STORED AS
+ PARQUET</codeph> in the <codeph>CREATE TABLE LIKE PARQUET</codeph> statement.
+ </li>
+
+ <li>
+ If the Parquet data file comes from an existing Impala table, currently, any <codeph>TINYINT</codeph> or
+ <codeph>SMALLINT</codeph> columns are turned into <codeph>INT</codeph> columns in the new table.
+ Internally, Parquet stores such values as 32-bit integers.
+ </li>
+
+ <li>
+ When the destination table uses the Parquet file format, the <codeph>CREATE TABLE AS SELECT</codeph> and
+ <codeph>INSERT ... SELECT</codeph> statements always create at least one data file, even if the
+ <codeph>SELECT</codeph> part of the statement does not match any rows. You can use such an empty Parquet
+ data file as a template for subsequent <codeph>CREATE TABLE LIKE PARQUET</codeph> statements.
+ </li>
+ </ul>
+
+ <p>
+ For more details about creating Parquet tables, and examples of the <codeph>CREATE TABLE LIKE
+ PARQUET</codeph> syntax, see <xref href="impala_parquet.xml#parquet"/>.
+ </p>
+
+ <p>
+ <b>Visibility and Metadata (TBLPROPERTIES and WITH SERDEPROPERTIES clauses):</b>
+ </p>
+
+ <p rev="1.2">
+ You can associate arbitrary items of metadata with a table by specifying the <codeph>TBLPROPERTIES</codeph>
+ clause. This clause takes a comma-separated list of key-value pairs and stores those items in the metastore
+ database. You can also change the table properties later with an <codeph>ALTER TABLE</codeph> statement. You
+ can observe the table properties for different delimiter and escape characters using the <codeph>DESCRIBE
+ FORMATTED</codeph> command, and change those settings for an existing table with <codeph>ALTER TABLE ... SET
+ TBLPROPERTIES</codeph>.
+ </p>
+
+ <p rev="1.2">
+ You can also associate SerDes properties with the table by specifying key-value pairs through the
+ <codeph>WITH SERDEPROPERTIES</codeph> clause. This metadata is not used by Impala, which has its own built-in
+ serializer and deserializer for the file formats it supports. Particular property values might be needed for
+ Hive compatibility with certain variations of file formats, particularly Avro.
+ </p>
+
+ <p>
+ Some DDL operations that interact with other Hadoop components require specifying particular values in the
+ <codeph>SERDEPROPERTIES</codeph> or <codeph>TBLPROPERTIES</codeph> fields, such as creating an Avro table or
+ an HBase table. (You typically create HBase tables in Hive, because they require additional clauses not
+ currently available in Impala.)
+<!-- Haven't got a working example from Lenni, so suppressing this recommendation for now.
+ The Avro schema properties can be specified through either
+ <codeph>TBLPROPERTIES</codeph> or <codeph>SERDEPROPERTIES</codeph>;
+ for best compatibility with future versions of Hive,
+ use <codeph>SERDEPROPERTIES</codeph> in this case.
+-->
+ </p>
+
+ <p>
+ To see the column definitions and column comments for an existing table, for example before issuing a
+ <codeph>CREATE TABLE ... LIKE</codeph> or a <codeph>CREATE TABLE ... AS SELECT</codeph> statement, issue the
+ statement <codeph>DESCRIBE <varname>table_name</varname></codeph>. To see even more detail, such as the
+ location of data files and the values for clauses such as <codeph>ROW FORMAT</codeph> and <codeph>STORED
+ AS</codeph>, issue the statement <codeph>DESCRIBE FORMATTED <varname>table_name</varname></codeph>.
+ <codeph>DESCRIBE FORMATTED</codeph> is also needed to see any overall table comment (as opposed to individual
+ column comments).
+ </p>
+
+ <p>
+ After creating a table, your <cmdname>impala-shell</cmdname> session or another
+ <cmdname>impala-shell</cmdname> connected to the same node can immediately query that table. There might be a
+ brief interval (one statestore heartbeat) before the table can be queried through a different Impala node. To
+ make the <codeph>CREATE TABLE</codeph> statement return only when the table is recognized by all Impala nodes
+ in the cluster, enable the <codeph>SYNC_DDL</codeph> query option.
+ </p>
+
+ <p rev="1.4.0">
+ <b>HDFS caching (CACHED IN clause):</b>
+ </p>
+
+ <p rev="1.4.0">
+ If you specify the <codeph>CACHED IN</codeph> clause, any existing or future data files in the table
+ directory or the partition subdirectories are designated to be loaded into memory with the HDFS caching
+ mechanism. See <xref href="impala_perf_hdfs_caching.xml#hdfs_caching"/> for details about using the HDFS
+ caching feature.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/impala_cache_replication_factor"/>
+
+<!-- Say something in here about the SHOW statement, e.g. SHOW TABLES, SHOW TABLE/COLUMN STATS, SHOW PARTITIONS. -->
+
+ <p>
+ <b>Column order</b>:
+ </p>
+
+ <p>
+ If you intend to use the table to hold data files produced by some external source, specify the columns in
+ the same order as they appear in the data files.
+ </p>
+
+ <p>
+ If you intend to insert or copy data into the table through Impala, or if you have control over the way
+ externally produced data files are arranged, use your judgment to specify columns in the most convenient
+ order:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ If certain columns are often <codeph>NULL</codeph>, specify those columns last. You might produce data
+ files that omit these trailing columns entirely. Impala automatically fills in the <codeph>NULL</codeph>
+ values if so.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If an unpartitioned table will be used as the source for an <codeph>INSERT ... SELECT</codeph> operation
+ into a partitioned table, specify last in the unpartitioned table any columns that correspond to
+ partition key columns in the partitioned table, and in the same order as the partition key columns are
+ declared in the partitioned table. This technique lets you use <codeph>INSERT ... SELECT *</codeph> when
+ copying data to the partitioned table, rather than specifying each column name individually.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If you specify columns in an order that you later discover is suboptimal, you can sometimes work around
+ the problem without recreating the table. You can create a view that selects columns from the original
+ table in a permuted order, then do a <codeph>SELECT *</codeph> from the view. When inserting data into a
+ table, you can specify a permuted order for the inserted columns to match the order in the destination
+ table.
+ </p>
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/hive_blurb"/>
+
+ <p>
+ Impala queries can make use of metadata about the table and columns, such as the number of rows in a table or
+ the number of different values in a column. Prior to Impala 1.2.2, to create this metadata, you issued the
+ <codeph>ANALYZE TABLE</codeph> statement in Hive to gather this information, after creating the table and
+ loading representative data into it. In Impala 1.2.2 and higher, the <codeph>COMPUTE STATS</codeph> statement
+ produces these statistics within Impala, without needing to use Hive at all.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/hbase_blurb"/>
+
+ <note>
+ <p>
+ The Impala <codeph>CREATE TABLE</codeph> statement cannot create an HBase table, because it currently does
+ not support the <codeph>STORED BY</codeph> clause needed for HBase tables. Create such tables in Hive, then
+ query them through Impala. For information on using Impala with HBase tables, see
+ <xref href="impala_hbase.xml#impala_hbase"/>.
+ </p>
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/s3_blurb"/>
+ <p rev="2.2.0">
+ To create a table where the data resides in the Amazon Simple Storage Service (S3),
+ specify an <codeph>s3a://</codeph> prefix in the <codeph>LOCATION</codeph> attribute pointing to the data files in S3.
+ You can use this special <codeph>LOCATION</codeph> syntax when creating an empty table,
+ but not as part of a <codeph>CREATE TABLE AS SELECT</codeph> statement.
+ See <xref href="impala_s3.xml#s3"/> for details.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/insert_sort_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/hdfs_blurb"/>
+
+ <p>
+ The <codeph>CREATE TABLE</codeph> statement for an internal table creates a directory in HDFS. The
+ <codeph>CREATE EXTERNAL TABLE</codeph> statement associates the table with an existing HDFS directory, and
+ does not create any new directory in HDFS. To locate the HDFS data directory for a table, issue a
+ <codeph>DESCRIBE FORMATTED <varname>table</varname></codeph> statement. To examine the contents of that HDFS
+ directory, use an OS command such as <codeph>hdfs dfs -ls hdfs://<varname>path</varname></codeph>, either
+ from the OS command line or through the <codeph>shell</codeph> or <codeph>!</codeph> commands in
+ <cmdname>impala-shell</cmdname>.
+ </p>
+
+ <p>
+ The <codeph>CREATE TABLE AS SELECT</codeph> syntax creates data files under the table data directory to hold
+ any data copied by the <codeph>INSERT</codeph> portion of the statement. (Even if no data is copied, Impala
+ might create one or more empty data files.)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ <!-- TBD. -->
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+ <p conref="../shared/impala_common.xml#common/redaction_yes"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_maybe"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_tables.xml#tables"/>,
+ <xref href="impala_alter_table.xml#alter_table"/>, <xref href="impala_drop_table.xml#drop_table"/>,
+ <xref href="impala_partitioning.xml#partitioning"/>, <xref href="impala_tables.xml#internal_tables"/>,
+ <xref href="impala_tables.xml#external_tables"/>, <xref href="impala_compute_stats.xml#compute_stats"/>,
+ <xref href="impala_sync_ddl.xml#sync_ddl"/>, <xref href="impala_show.xml#show_tables"/>,
+ <xref href="impala_show.xml#show_create_table"/>, <xref href="impala_describe.xml#describe"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_create_view.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_create_view.xml b/docs/topics/impala_create_view.xml
new file mode 100644
index 0000000..2458279
--- /dev/null
+++ b/docs/topics/impala_create_view.xml
@@ -0,0 +1,136 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.1" id="create_view">
+
+ <title>CREATE VIEW Statement</title>
+ <titlealts><navtitle>CREATE VIEW</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">CREATE VIEW statement</indexterm>
+ The <codeph>CREATE VIEW</codeph> statement lets you create a shorthand abbreviation for a more complicated
+ query. The base query can involve joins, expressions, reordered columns, column aliases, and other SQL
+ features that can make a query hard to understand or maintain.
+ </p>
+
+ <p>
+ Because a view is purely a logical construct (an alias for a query) with no physical data behind it,
+ <codeph>ALTER VIEW</codeph> only involves changes to metadata in the metastore database, not any data files
+ in HDFS.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>CREATE VIEW [IF NOT EXISTS] <varname>view_name</varname> [(<varname>column_list</varname>)]
+ AS <varname>select_statement</varname></codeblock>
+
+ <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ The <codeph>CREATE VIEW</codeph> statement can be useful in scenarios such as the following:
+ </p>
+
+ <ul>
+ <li>
+ To turn even the most lengthy and complicated SQL query into a one-liner. You can issue simple queries
+ against the view from applications, scripts, or interactive queries in <cmdname>impala-shell</cmdname>.
+ For example:
+<codeblock>select * from <varname>view_name</varname>;
+select * from <varname>view_name</varname> order by c1 desc limit 10;</codeblock>
+ The more complicated and hard-to-read the original query, the more benefit there is to simplifying the
+ query using a view.
+ </li>
+
+ <li>
+ To hide the underlying table and column names, to minimize maintenance problems if those names change. In
+ that case, you re-create the view using the new names, and all queries that use the view rather than the
+ underlying tables keep running with no changes.
+ </li>
+
+ <li>
+ To experiment with optimization techniques and make the optimized queries available to all applications.
+ For example, if you find a combination of <codeph>WHERE</codeph> conditions, join order, join hints, and so
+ on that works the best for a class of queries, you can establish a view that incorporates the
+ best-performing techniques. Applications can then make relatively simple queries against the view, without
+ repeating the complicated and optimized logic over and over. If you later find a better way to optimize the
+ original query, when you re-create the view, all the applications immediately take advantage of the
+ optimized base query.
+ </li>
+
+ <li>
+ To simplify a whole class of related queries, especially complicated queries involving joins between
+ multiple tables, complicated expressions in the column list, and other SQL syntax that makes the query
+ difficult to understand and debug. For example, you might create a view that joins several tables, filters
+ using several <codeph>WHERE</codeph> conditions, and selects several columns from the result set.
+ Applications might issue queries against this view that only vary in their <codeph>LIMIT</codeph>,
+ <codeph>ORDER BY</codeph>, and similar simple clauses.
+ </li>
+ </ul>
+
+ <p>
+ For queries that require repeating complicated clauses over and over again, for example in the select list,
+ <codeph>ORDER BY</codeph>, and <codeph>GROUP BY</codeph> clauses, you can use the <codeph>WITH</codeph>
+ clause as an alternative to creating a view.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+ <p conref="../shared/impala_common.xml#common/complex_types_views"/>
+ <p conref="../shared/impala_common.xml#common/complex_types_views_caveat"/>
+
+ <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+ <p conref="../shared/impala_common.xml#common/redaction_yes"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<!-- TK: Elaborate on these, show queries and real output. -->
+
+<codeblock>-- Create a view that is exactly the same as the underlying table.
+create view v1 as select * from t1;
+
+-- Create a view that includes only certain columns from the underlying table.
+create view v2 as select c1, c3, c7 from t1;
+
+-- Create a view that filters the values from the underlying table.
+create view v3 as select distinct c1, c3, c7 from t1 where c1 is not null and c5 > 0;
+
+-- Create a view that reorders and renames columns from the underlying table.
+create view v4 as select c4 as last_name, c6 as address, c2 as birth_date from t1;
+
+-- Create a view that runs functions to convert or transform certain columns.
+create view v5 as select c1, cast(c3 as string) c3, concat(c4,c5) c5, trim(c6) c6, "Constant" c8 from t1;
+
+-- Create a view that hides the complexity of a view query.
+create view v6 as select t1.c1, t2.c2 from t1 join t2 on t1.id = t2.id;
+</codeblock>
+
+<!-- These examples show CREATE VIEW and corresponding DROP VIEW statements, with different combinations
+ of qualified and unqualified names. -->
+
+ <p conref="../shared/impala_common.xml#common/create_drop_view_examples"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_views.xml#views"/>, <xref href="impala_alter_view.xml#alter_view"/>,
+ <xref href="impala_drop_view.xml#drop_view"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_databases.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_databases.xml b/docs/topics/impala_databases.xml
new file mode 100644
index 0000000..ad0511f
--- /dev/null
+++ b/docs/topics/impala_databases.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="databases">
+
+ <title>Overview of Impala Databases</title>
+ <titlealts><navtitle>Databases</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Databases"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ In Impala, a database is a logical container for a group of tables. Each database defines a separate
+ namespace. Within a database, you can refer to the tables inside it using their unqualified names. Different
+ databases can contain tables with identical names.
+ </p>
+
+ <p>
+ Creating a database is a lightweight operation. There are minimal database-specific properties to configure,
+ only <codeph>LOCATION</codeph> and <codeph>COMMENT</codeph>. There is no <codeph>ALTER DATABASE</codeph> statement.
+ </p>
+
+ <p>
+ Typically, you create a separate database for each project or application, to avoid naming conflicts between
+ tables and to make clear which tables are related to each other. The <codeph>USE</codeph> statement lets
+ you switch between databases. Unqualified references to tables, views, and functions refer to objects
+ within the current database. You can also refer to objects in other databases by using qualified names
+ of the form <codeph><varname>dbname</varname>.<varname>object_name</varname></codeph>.
+ </p>
+
+ <p>
+ Each database is physically represented by a directory in HDFS. When you do not specify a <codeph>LOCATION</codeph>
+ attribute, the directory is located in the Impala data directory with the associated tables managed by Impala.
+ When you do specify a <codeph>LOCATION</codeph> attribute, any read and write operations for tables in that
+ database are relative to the specified HDFS directory.
+ </p>
+
+ <p>
+ There is a special database, named <codeph>default</codeph>, where you begin when you connect to Impala.
+ Tables created in <codeph>default</codeph> are physically located one level higher in HDFS than all the
+ user-created databases.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/builtins_db"/>
+
+ <p>
+ <b>Related statements:</b>
+ </p>
+
+ <p>
+ <xref href="impala_create_database.xml#create_database"/>,
+ <xref href="impala_drop_database.xml#drop_database"/>, <xref href="impala_use.xml#use"/>,
+ <xref href="impala_show.xml#show_databases"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_datatypes.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_datatypes.xml b/docs/topics/impala_datatypes.xml
new file mode 100644
index 0000000..e45867e
--- /dev/null
+++ b/docs/topics/impala_datatypes.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="datatypes">
+
+ <title>Data Types</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">data types</indexterm>
+ Impala supports a set of data types that you can use for table columns, expression values, and function
+ arguments and return values.
+ </p>
+
+ <note>
+ Currently, Impala supports only scalar types, not composite or nested types. Accessing a table containing any
+ columns with unsupported types causes an error.
+ </note>
+
+ <p outputclass="toc"/>
+
+ <p>
+ For the notation to write literals of each of these data types, see
+ <xref href="impala_literals.xml#literals"/>.
+ </p>
+
+ <p>
+ See <xref href="impala_langref_unsupported.xml#langref_hiveql_delta"/> for differences between Impala and
+ Hive data types.
+ </p>
+ </conbody>
+</concept>
[14/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_datetime_functions.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_datetime_functions.xml b/docs/topics/impala_datetime_functions.xml
new file mode 100644
index 0000000..16ae088
--- /dev/null
+++ b/docs/topics/impala_datetime_functions.xml
@@ -0,0 +1,1505 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="datetime_functions">
+
+ <title>Impala Date and Time Functions</title>
+ <titlealts><navtitle>Date and Time Functions</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Dates and Times"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ The underlying Impala data type for date and time data is
+ <codeph><xref href="impala_timestamp.xml#timestamp">TIMESTAMP</xref></codeph>, which has both a date and a
+ time portion. Functions that extract a single field, such as <codeph>hour()</codeph> or
+ <codeph>minute()</codeph>, typically return an integer value. Functions that format the date portion, such as
+ <codeph>date_add()</codeph> or <codeph>to_date()</codeph>, typically return a string value.
+ </p>
+
+ <p>
+ You can also adjust a <codeph>TIMESTAMP</codeph> value by adding or subtracting an <codeph>INTERVAL</codeph>
+ expression. See <xref href="impala_timestamp.xml#timestamp"/> for details. <codeph>INTERVAL</codeph>
+ expressions are also allowed as the second argument for the <codeph>date_add()</codeph> and
+ <codeph>date_sub()</codeph> functions, rather than integers.
+ </p>
+
+ <p rev="2.2.0">
+ Some of these functions are affected by the setting of the
+ <codeph>-use_local_tz_for_unix_timestamp_conversions</codeph> startup flag for the
+ <cmdname>impalad</cmdname> daemon. This setting is off by default, meaning that
+ functions such as <codeph>from_unixtime()</codeph> and <codeph>unix_timestamp()</codeph>
+ consider the input values to always represent the UTC time zone.
+ This setting also applies when you <codeph>CAST()</codeph> a <codeph>BIGINT</codeph>
+ value to <codeph>TIMESTAMP</codeph>, or a <codeph>TIMESTAMP</codeph>
+ value to <codeph>BIGINT</codeph>.
+ When this setting is enabled, these functions and operations convert to and from
+ values representing the local time zone.
+ See <xref href="impala_timestamp.xml#timestamp"/> for details about how
+ Impala handles time zone considerations for the <codeph>TIMESTAMP</codeph> data type.
+ </p>
+
+ <p>
+ <b>Function reference:</b>
+ </p>
+
+ <p>
+ Impala supports the following date and time functions:
+ </p>
+
+<!-- New for 2.3:
+int_months_between
+timeofday
+timestamp_cmp
+months_between
+-->
+
+ <dl>
+ <dlentry rev="1.4.0" id="add_months">
+
+ <dt>
+ <codeph>add_months(timestamp date, int months)</codeph>, <codeph>add_months(timestamp date, bigint
+ months)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">add_months() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time plus some number of months.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Same as <codeph>months_add()</codeph>. Available in Impala 1.4 and higher. For
+ compatibility when porting code with vendor extensions.
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="adddate">
+
+ <dt>
+ <codeph>adddate(timestamp startdate, int days)</codeph>, <codeph>adddate(timestamp startdate, bigint
+ days)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">adddate() function</indexterm>
+ <b>Purpose:</b> Adds a specified number of days to a <codeph>TIMESTAMP</codeph> value. Similar to
+ <codeph>date_add()</codeph>, but starts with an actual <codeph>TIMESTAMP</codeph> value instead of a
+ string that is converted to a <codeph>TIMESTAMP</codeph>.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="current_timestamp">
+
+ <dt>
+ <codeph>current_timestamp()</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">current_timestamp() function</indexterm>
+ <b>Purpose:</b> Alias for the <codeph>now()</codeph> function.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="date_add">
+
+ <dt>
+ <codeph>date_add(timestamp startdate, int days)</codeph>, <codeph>date_add(timestamp startdate,
+ <varname>interval_expression</varname>)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">date_add() function</indexterm>
+ <b>Purpose:</b> Adds a specified number of days to a <codeph>TIMESTAMP</codeph> value. The first argument
+ can be a string, which is automatically cast to <codeph>TIMESTAMP</codeph> if it uses the recognized
+ format, as described in <xref href="impala_timestamp.xml#timestamp"/>. With an <codeph>INTERVAL</codeph>
+ expression as the second argument, you can calculate a delta value using other units such as weeks,
+ years, hours, seconds, and so on; see <xref href="impala_timestamp.xml#timestamp"/> for details.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="2.0.0" id="date_part">
+
+ <dt>
+ <codeph>date_part(string, timestamp)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">date_part() function</indexterm>
+ <b>Purpose:</b> Similar to <codeph>EXTRACT()</codeph>, with the argument order reversed. Supports the
+ same date and time units as <codeph>EXTRACT()</codeph>. For compatibility with SQL code containing vendor
+ extensions.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="date_sub">
+
+ <dt>
+ <codeph>date_sub(timestamp startdate, int days)</codeph>, <codeph>date_sub(timestamp startdate,
+ <varname>interval_expression</varname>)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">date_sub() function</indexterm>
+ <b>Purpose:</b> Subtracts a specified number of days from a <codeph>TIMESTAMP</codeph> value. The first
+ argument can be a string, which is automatically cast to <codeph>TIMESTAMP</codeph> if it uses the
+ recognized format, as described in <xref href="impala_timestamp.xml#timestamp"/>. With an
+ <codeph>INTERVAL</codeph> expression as the second argument, you can calculate a delta value using other
+ units such as weeks, years, hours, seconds, and so on; see <xref href="impala_timestamp.xml#timestamp"/>
+ for details.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="datediff">
+
+ <dt>
+ <codeph>datediff(string enddate, string startdate)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">datediff() function</indexterm>
+ <b>Purpose:</b> Returns the number of days between two dates represented as strings.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="day">
+
+ <dt>
+ <!-- <codeph>day(string date), <ph id="dayofmonth">dayofmonth(string date)</ph></codeph> -->
+ <codeph>day(timestamp date), <ph id="dayofmonth">dayofmonth(timestamp date)</ph></codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">day() function</indexterm>
+ <b>Purpose:</b> Returns the day field from the date portion of a <codeph>TIMESTAMP</codeph>.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.2" id="dayname">
+
+ <dt>
+ <!-- <codeph>dayname(string date)</codeph> -->
+ <codeph>dayname(timestamp date)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">dayname() function</indexterm>
+ <b>Purpose:</b> Returns the day field from the date portion of a <codeph>TIMESTAMP</codeph>, converted to the string
+ corresponding to that day name. The range of return values is <codeph>'Sunday'</codeph> to
+ <codeph>'Saturday'</codeph>. Used in report-generating queries, as an alternative to calling
+ <codeph>dayofweek()</codeph> and turning that numeric return value into a string using a
+ <codeph>CASE</codeph> expression.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.1" id="dayofweek">
+
+ <dt>
+ <!-- <codeph>dayofweek(string date)</codeph> -->
+ <codeph>dayofweek(timestamp date)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">dayofweek() function</indexterm>
+ <b>Purpose:</b> Returns the day field from the date portion of a <codeph>TIMESTAMP</codeph>, corresponding to the day of
+ the week. The range of return values is 1 (Sunday) to 7 (Saturday).
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="dayofyear">
+
+ <dt>
+ <codeph>dayofyear(timestamp date)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">dayofyear() function</indexterm>
+ <b>Purpose:</b> Returns the day field from a <codeph>TIMESTAMP</codeph> value, corresponding to the day
+ of the year. The range of return values is 1 (January 1) to 366 (December 31 of a leap year).
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="days_add">
+
+ <dt>
+ <codeph>days_add(timestamp startdate, int days)</codeph>, <codeph>days_add(timestamp startdate, bigint
+ days)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">days_add() function</indexterm>
+ <b>Purpose:</b> Adds a specified number of days to a <codeph>TIMESTAMP</codeph> value. Similar to
+ <codeph>date_add()</codeph>, but starts with an actual <codeph>TIMESTAMP</codeph> value instead of a
+ string that is converted to a <codeph>TIMESTAMP</codeph>.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="days_sub">
+
+ <dt>
+ <codeph>days_sub(timestamp startdate, int days)</codeph>, <codeph>days_sub(timestamp startdate, bigint
+ days)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">days_sub() function</indexterm>
+ <b>Purpose:</b> Subtracts a specified number of days from a <codeph>TIMESTAMP</codeph> value. Similar to
+ <codeph>date_sub()</codeph>, but starts with an actual <codeph>TIMESTAMP</codeph> value instead of a
+ string that is converted to a <codeph>TIMESTAMP</codeph>.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.4.0" id="extract">
+
+ <dt>
+ <codeph>extract(timestamp, string unit)</codeph>, <codeph rev="2.0.0">extract(unit FROM timestamp)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">extract() function</indexterm>
+ <b>Purpose:</b> Returns one of the numeric date or time fields from a <codeph>TIMESTAMP</codeph> value.
+ <p>
+ <b>Unit argument:</b> The <codeph>unit</codeph> string can be one of <codeph>year</codeph>,
+ <codeph>month</codeph>, <codeph>day</codeph>, <codeph>hour</codeph>, <codeph>minute</codeph>,
+ <codeph>second</codeph>, or <codeph>millisecond</codeph>. This argument value is case-insensitive.
+ </p>
+ <p rev="2.0.0">
+ In Impala 2.0 and higher, you can use special syntax rather than a regular function call, for
+ compatibility with code that uses the SQL-99 format with the <codeph>FROM</codeph> keyword. With this
+ style, the unit names are identifiers rather than <codeph>STRING</codeph> literals. For example, the
+ following calls are both equivalent:
+<codeblock>extract(year from now());
+extract(now(), "year");
+</codeblock>
+ </p>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Typically used in <codeph>GROUP BY</codeph> queries to arrange results by hour,
+ day, month, and so on. You can also use this function in an <codeph>INSERT ... SELECT</codeph> into a
+ partitioned table to split up <codeph>TIMESTAMP</codeph> values into individual parts, if the
+ partitioned table has separate partition key columns representing year, month, day, and so on. If you
+ need to divide by more complex units of time, such as by week or by quarter, use the
+ <codeph>TRUNC()</codeph> function instead.
+ </p>
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="from_unixtime">
+
+ <dt>
+ <codeph>from_unixtime(bigint unixtime[, string format])</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">from_unixtime() function</indexterm>
+ <b>Purpose:</b> Converts the number of seconds from the Unix epoch to the specified time into a string in
+ the local time zone.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/y2k38"/>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ The format string accepts the variations allowed for the <codeph>TIMESTAMP</codeph>
+ data type: date plus time, date by itself, time by itself, and optional fractional seconds for the
+ time. See <xref href="impala_timestamp.xml#timestamp"/> for details.
+ </p>
+ <p rev="1.3.0">
+ Currently, the format string is case-sensitive, especially to distinguish <codeph>m</codeph> for
+ minutes and <codeph>M</codeph> for months. In Impala 1.3 and later, you can switch the order of
+ elements, use alternative separator characters, and use a different number of placeholders for each
+ unit. Adding more instances of <codeph>y</codeph>, <codeph>d</codeph>, <codeph>H</codeph>, and so on
+ produces output strings zero-padded to the requested number of characters. The exception is
+ <codeph>M</codeph> for months, where <codeph>M</codeph> produces a non-padded value such as
+ <codeph>3</codeph>, <codeph>MM</codeph> produces a zero-padded value such as <codeph>03</codeph>,
+ <codeph>MMM</codeph> produces an abbreviated month name such as <codeph>Mar</codeph>, and sequences of
+ 4 or more <codeph>M</codeph> are not allowed. A date string including all fields could be
+ <codeph>"yyyy-MM-dd HH:mm:ss.SSSSSS"</codeph>, <codeph>"dd/MM/yyyy HH:mm:ss.SSSSSS"</codeph>,
+ <codeph>"MMM dd, yyyy HH.mm.ss (SSSSSS)"</codeph> or other combinations of placeholders and separator
+ characters.
+ </p>
+ <p conref="../shared/impala_common.xml#common/timezone_conversion_caveat"/>
+ <note rev="1.3.0">
+ The more flexible format strings allowed with the built-in functions do not change the rules about
+ using <codeph>CAST()</codeph> to convert from a string to a <codeph>TIMESTAMP</codeph> value. Strings
+ being converted through <codeph>CAST()</codeph> must still have the elements in the specified order and use the specified delimiter
+ characters, as described in <xref href="impala_timestamp.xml#timestamp"/>.
+ </note>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>[localhost:21000] > select from_unixtime(1392394861,"yyyy-MM-dd HH:mm:ss.SSSS");
++-------------------------------------------------------+
+| from_unixtime(1392394861, 'yyyy-mm-dd hh:mm:ss.ssss') |
++-------------------------------------------------------+
+| 2014-02-14 16:21:01.0000 |
++-------------------------------------------------------+
+[localhost:21000] > select from_unixtime(1392394861,"yyyy-MM-dd");
++-----------------------------------------+
+| from_unixtime(1392394861, 'yyyy-mm-dd') |
++-----------------------------------------+
+| 2014-02-14 |
++-----------------------------------------+
+[localhost:21000] > select from_unixtime(1392394861,"HH:mm:ss.SSSS");
++--------------------------------------------+
+| from_unixtime(1392394861, 'hh:mm:ss.ssss') |
++--------------------------------------------+
+| 16:21:01.0000 |
++--------------------------------------------+
+[localhost:21000] > select from_unixtime(1392394861,"HH:mm:ss");
++---------------------------------------+
+| from_unixtime(1392394861, 'hh:mm:ss') |
++---------------------------------------+
+| 16:21:01 |
++---------------------------------------+</codeblock>
+ <p conref="../shared/impala_common.xml#common/datetime_function_chaining"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="from_utc_timestamp">
+
+ <dt>
+ <codeph>from_utc_timestamp(timestamp, string timezone)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">from_utc_timestamp() function</indexterm>
+ <b>Purpose:</b> Converts a specified UTC timestamp value into the appropriate value for a specified time
+ zone.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ <p>
+ <b>Usage notes:</b> Often used to translate UTC time zone data stored in a table back to the local
+ date and time for reporting. The opposite of the <codeph>to_utc_timestamp()</codeph> function.
+ </p>
+ <p>
+ <b>Examples:</b> See discussion of time zones in <xref href="impala_timestamp.xml#timestamp"/>
+ for information about using this function for conversions between the local time zone and UTC.
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="hour">
+
+ <dt>
+ <codeph>hour(string date)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">hour() function</indexterm>
+ <b>Purpose:</b> Returns the hour field from a date represented as a string.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="hours_add">
+
+ <dt>
+ <codeph>hours_add(timestamp date, int hours)</codeph>, <codeph>hours_add(timestamp date, bigint
+ hours)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">hours_add() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time plus some number of hours.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="hours_sub">
+
+ <dt>
+ <codeph>hours_sub(timestamp date, int hours)</codeph>, <codeph>hours_sub(timestamp date, bigint
+ hours)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">hours_sub() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time minus some number of hours.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="2.3.0" id="int_months_between">
+
+ <dt>
+ <codeph>int_months_between(timestamp newer, timestamp older)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">int_months_between() function</indexterm>
+ <b>Purpose:</b> Returns the number of months between the date portions of two <codeph>TIMESTAMP</codeph> values,
+ as an <codeph>INT</codeph> representing only the full months that passed.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Typically used in business contexts, for example to determine whether
+ a specified number of months have passed or whether some end-of-month deadline was reached.
+ </p>
+ <p>
+ The method of determining the number of elapsed months includes some special handling of
+ months with different numbers of days that creates edge cases for dates between the
+ 28th and 31st days of certain months. See <codeph>months_between()</codeph> for details.
+ The <codeph>int_months_between()</codeph> result is essentially the <codeph>floor()</codeph>
+ of the <codeph>months_between()</codeph> result.
+ </p>
+ <p>
+ If either value is <codeph>NULL</codeph>, which could happen for example when converting a
+ nonexistent date string such as <codeph>'2015-02-29'</codeph> to a <codeph>TIMESTAMP</codeph>,
+ the result is also <codeph>NULL</codeph>.
+ </p>
+ <p>
+ If the first argument represents an earlier time than the second argument, the result is negative.
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>/* Less than a full month = 0. */
+select int_months_between('2015-02-28', '2015-01-29');
++------------------------------------------------+
+| int_months_between('2015-02-28', '2015-01-29') |
++------------------------------------------------+
+| 0 |
++------------------------------------------------+
+
+/* Last day of month to last day of next month = 1. */
+select int_months_between('2015-02-28', '2015-01-31');
++------------------------------------------------+
+| int_months_between('2015-02-28', '2015-01-31') |
++------------------------------------------------+
+| 1 |
++------------------------------------------------+
+
+/* Slightly less than 2 months = 1. */
+select int_months_between('2015-03-28', '2015-01-31');
++------------------------------------------------+
+| int_months_between('2015-03-28', '2015-01-31') |
++------------------------------------------------+
+| 1 |
++------------------------------------------------+
+
+/* 2 full months (identical days of the month) = 2. */
+select int_months_between('2015-03-31', '2015-01-31');
++------------------------------------------------+
+| int_months_between('2015-03-31', '2015-01-31') |
++------------------------------------------------+
+| 2 |
++------------------------------------------------+
+
+/* Last day of month to last day of month-after-next = 2. */
+select int_months_between('2015-03-31', '2015-01-30');
++------------------------------------------------+
+| int_months_between('2015-03-31', '2015-01-30') |
++------------------------------------------------+
+| 2 |
++------------------------------------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="microseconds_add">
+
+ <dt>
+ <codeph>microseconds_add(timestamp date, int microseconds)</codeph>, <codeph>microseconds_add(timestamp
+ date, bigint microseconds)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">microseconds_add() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time plus some number of microseconds.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="microseconds_sub">
+
+ <dt>
+ <codeph>microseconds_sub(timestamp date, int microseconds)</codeph>, <codeph>microseconds_sub(timestamp
+ date, bigint microseconds)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">microseconds_sub() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time minus some number of microseconds.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="milliseconds_add">
+
+ <dt>
+ <codeph>milliseconds_add(timestamp date, int milliseconds)</codeph>, <codeph>milliseconds_add(timestamp
+ date, bigint milliseconds)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">milliseconds_add() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time plus some number of milliseconds.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="milliseconds_sub">
+
+ <dt>
+ <codeph>milliseconds_sub(timestamp date, int milliseconds)</codeph>, <codeph>milliseconds_sub(timestamp
+ date, bigint milliseconds)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">milliseconds_sub() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time minus some number of milliseconds.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="minute">
+
+ <dt>
+ <codeph>minute(string date)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">minute() function</indexterm>
+ <b>Purpose:</b> Returns the minute field from a date represented as a string.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="minutes_add">
+
+ <dt>
+ <codeph>minutes_add(timestamp date, int minutes)</codeph>, <codeph>minutes_add(timestamp date, bigint
+ minutes)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">minutes_add() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time plus some number of minutes.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="minutes_sub">
+
+ <dt>
+ <codeph>minutes_sub(timestamp date, int minutes)</codeph>, <codeph>minutes_sub(timestamp date, bigint
+ minutes)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">minutes_sub() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time minus some number of minutes.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="month">
+
+ <dt>
+ <!-- <codeph>month(string date)</codeph> -->
+ <codeph>month(timestamp date)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">month() function</indexterm>
+ <b>Purpose:</b> Returns the month field from the date portion of a <codeph>TIMESTAMP</codeph>.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="months_add">
+
+ <dt>
+ <codeph>months_add(timestamp date, int months)</codeph>, <codeph>months_add(timestamp date, bigint
+ months)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">months_add() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time plus some number of months.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="2.3.0" id="months_between">
+
+ <dt>
+ <codeph>months_between(timestamp newer, timestamp older)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">months_between() function</indexterm>
+ <b>Purpose:</b> Returns the number of months between the date portions of two <codeph>TIMESTAMP</codeph> values.
+ Can include a fractional part representing extra days in addition to the full months
+ between the dates. The fractional component is computed by dividing the difference in days by 31 (regardless of the month).
+ <p>
+ <b>Return type:</b> <codeph>double</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Typically used in business contexts, for example to determine whether
+ a specified number of months have passed or whether some end-of-month deadline was reached.
+ </p>
+ <p>
+ If the only consideration is the number of full months and any fractional value is
+ not significant, use <codeph>int_months_between()</codeph> instead.
+ </p>
+ <p>
+ The method of determining the number of elapsed months includes some special handling of
+ months with different numbers of days that creates edge cases for dates between the
+ 28th and 31st days of certain months.
+ </p>
+ <p>
+ If either value is <codeph>NULL</codeph>, which could happen for example when converting a
+ nonexistent date string such as <codeph>'2015-02-29'</codeph> to a <codeph>TIMESTAMP</codeph>,
+ the result is also <codeph>NULL</codeph>.
+ </p>
+ <p>
+ If the first argument represents an earlier time than the second argument, the result is negative.
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show how dates that are on the same day of the month
+ are considered to be exactly N months apart, even if the months have different
+ numbers of days.
+ </p>
+<codeblock>select months_between('2015-02-28', '2015-01-28');
++--------------------------------------------+
+| months_between('2015-02-28', '2015-01-28') |
++--------------------------------------------+
+| 1 |
++--------------------------------------------+
+
+select months_between(now(), now() + interval 1 month);
++-------------------------------------------------+
+| months_between(now(), now() + interval 1 month) |
++-------------------------------------------------+
+| -1 |
++-------------------------------------------------+
+
+select months_between(now() + interval 1 year, now());
++------------------------------------------------+
+| months_between(now() + interval 1 year, now()) |
++------------------------------------------------+
+| 12 |
++------------------------------------------------+
+</codeblock>
+ <p>
+ The following examples show how dates that are on the last day of the month
+ are considered to be exactly N months apart, even if the months have different
+ numbers of days. For example, from January 28th to February 28th is exactly one
+ month because the day of the month is identical; January 31st to February 28th
+ is exactly one month because in both cases it is the last day of the month;
+ but January 29th or 30th to February 28th is considered a fractional month.
+ </p>
+<codeblock>select months_between('2015-02-28', '2015-01-31');
++--------------------------------------------+
+| months_between('2015-02-28', '2015-01-31') |
++--------------------------------------------+
+| 1 |
++--------------------------------------------+
+
+select months_between('2015-02-28', '2015-01-29');
++--------------------------------------------+
+| months_between('2015-02-28', '2015-01-29') |
++--------------------------------------------+
+| 0.967741935483871 |
++--------------------------------------------+
+
+select months_between('2015-02-28', '2015-01-30');
++--------------------------------------------+
+| months_between('2015-02-28', '2015-01-30') |
++--------------------------------------------+
+| 0.935483870967742 |
++--------------------------------------------+
+</codeblock>
+ <p>
+ The following examples show how dates that are not a precise number
+ of months apart result in a fractional return value.
+ </p>
+<codeblock>select months_between('2015-03-01', '2015-01-28');
++--------------------------------------------+
+| months_between('2015-03-01', '2015-01-28') |
++--------------------------------------------+
+| 1.129032258064516 |
++--------------------------------------------+
+
+select months_between('2015-03-01', '2015-02-28');
++--------------------------------------------+
+| months_between('2015-03-01', '2015-02-28') |
++--------------------------------------------+
+| 0.1290322580645161 |
++--------------------------------------------+
+
+select months_between('2015-06-02', '2015-05-29');
++--------------------------------------------+
+| months_between('2015-06-02', '2015-05-29') |
++--------------------------------------------+
+| 0.1290322580645161 |
++--------------------------------------------+
+
+select months_between('2015-03-01', '2015-01-25');
++--------------------------------------------+
+| months_between('2015-03-01', '2015-01-25') |
++--------------------------------------------+
+| 1.225806451612903 |
++--------------------------------------------+
+
+select months_between('2015-03-01', '2015-02-25');
++--------------------------------------------+
+| months_between('2015-03-01', '2015-02-25') |
++--------------------------------------------+
+| 0.2258064516129032 |
++--------------------------------------------+
+
+select months_between('2015-02-28', '2015-02-01');
++--------------------------------------------+
+| months_between('2015-02-28', '2015-02-01') |
++--------------------------------------------+
+| 0.8709677419354839 |
++--------------------------------------------+
+
+select months_between('2015-03-28', '2015-03-01');
++--------------------------------------------+
+| months_between('2015-03-28', '2015-03-01') |
++--------------------------------------------+
+| 0.8709677419354839 |
++--------------------------------------------+
+</codeblock>
+ <p>
+ The following examples show how the time portion of the <codeph>TIMESTAMP</codeph>
+ values are irrelevant for calculating the month interval. Even the fractional part
+ of the result only depends on the number of full days between the argument values,
+ regardless of the time portion.
+ </p>
+<codeblock>select months_between('2015-05-28 23:00:00', '2015-04-28 11:45:00');
++--------------------------------------------------------------+
+| months_between('2015-05-28 23:00:00', '2015-04-28 11:45:00') |
++--------------------------------------------------------------+
+| 1 |
++--------------------------------------------------------------+
+
+select months_between('2015-03-28', '2015-03-01');
++--------------------------------------------+
+| months_between('2015-03-28', '2015-03-01') |
++--------------------------------------------+
+| 0.8709677419354839 |
++--------------------------------------------+
+
+select months_between('2015-03-28 23:00:00', '2015-03-01 11:45:00');
++--------------------------------------------------------------+
+| months_between('2015-03-28 23:00:00', '2015-03-01 11:45:00') |
++--------------------------------------------------------------+
+| 0.8709677419354839 |
++--------------------------------------------------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="months_sub">
+
+ <dt>
+ <codeph>months_sub(timestamp date, int months)</codeph>, <codeph>months_sub(timestamp date, bigint
+ months)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">months_sub() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time minus some number of months.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="nanoseconds_add">
+
+ <dt>
+ <codeph>nanoseconds_add(timestamp date, int nanoseconds)</codeph>, <codeph>nanoseconds_add(timestamp
+ date, bigint nanoseconds)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">nanoseconds_add() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time plus some number of nanoseconds.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="nanoseconds_sub">
+
+ <dt>
+ <codeph>nanoseconds_sub(timestamp date, int nanoseconds)</codeph>, <codeph>nanoseconds_sub(timestamp
+ date, bigint nanoseconds)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">nanoseconds_sub() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time minus some number of nanoseconds.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="now">
+
+ <dt>
+ <codeph>now()</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">now() function</indexterm>
+<!-- <b>Purpose:</b> Returns the current date and time (in the UTC time zone) as a <codeph>timestamp</codeph> value. -->
+ <b>Purpose:</b> Returns the current date and time (in the local time zone) as a
+ <codeph>timestamp</codeph> value.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ To find a date/time value in the future or the past relative to the current date
+ and time, add or subtract an <codeph>INTERVAL</codeph> expression to the return value of
+ <codeph>now()</codeph>. See <xref href="impala_timestamp.xml#timestamp"/> for examples.
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="second">
+
+ <dt>
+ <codeph>second(string date)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">second() function</indexterm>
+ <b>Purpose:</b> Returns the second field from a date represented as a string.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="seconds_add">
+
+ <dt>
+ <codeph>seconds_add(timestamp date, int seconds)</codeph>, <codeph>seconds_add(timestamp date, bigint
+ seconds)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">seconds_add() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time plus some number of seconds.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="seconds_sub">
+
+ <dt>
+ <codeph>seconds_sub(timestamp date, int seconds)</codeph>, <codeph>seconds_sub(timestamp date, bigint
+ seconds)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">seconds_sub() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time minus some number of seconds.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="subdate">
+
+ <dt>
+ <codeph>subdate(timestamp startdate, int days)</codeph>, <codeph>subdate(timestamp startdate, bigint
+ days)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">subdate() function</indexterm>
+ <b>Purpose:</b> Subtracts a specified number of days from a <codeph>TIMESTAMP</codeph> value. Similar to
+ <codeph>date_sub()</codeph>, but starts with an actual <codeph>TIMESTAMP</codeph> value instead of a
+ string that is converted to a <codeph>TIMESTAMP</codeph>.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="2.3.0" id="timeofday">
+
+ <dt>
+ <codeph>timeofday()</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">timeofday() function</indexterm>
+ <b>Purpose:</b> Returns a string representation of the current date and time, according to the time of the local system,
+ including any time zone designation.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p>
+ <b>Usage notes:</b> The result value represents similar information as the
+ <codeph>now()</codeph> function, only as a <codeph>STRING</codeph> type
+ and with somewhat different formatting. For example, the day of the week
+ and the time zone identifier are included. This function is intended
+ primarily for compatibility with SQL code from other systems that
+ also have a <codeph>timeofday()</codeph> function. Prefer to use
+ <codeph>now()</codeph> if practical for any new Impala code.
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show the format of the <codeph>timeofday()</codeph>
+ return value, illustrate how that value is represented as a <codeph>STRING</codeph>
+ that you can manipulate with string processing functions, and how the format
+ compares with the return value from the <codeph>now()</codeph> function.
+ </p>
+<codeblock>/* Date and time fields in a STRING return value. */
+select timeofday();
++------------------------------+
+| timeofday() |
++------------------------------+
+| Tue Sep 01 15:13:18 2015 PDT |
++------------------------------+
+
+/* The return value can be processed by other string functions. */
+select upper(timeofday());
++------------------------------+
+| upper(timeofday()) |
++------------------------------+
+| TUE SEP 01 15:13:38 2015 PDT |
++------------------------------+
+
+/* The TIMEOFDAY() result is formatted differently than NOW(). NOW() returns a TIMESTAMP. */
+select now(), timeofday();
++-------------------------------+------------------------------+
+| now() | timeofday() |
++-------------------------------+------------------------------+
+| 2015-09-01 15:15:25.930021000 | Tue Sep 01 15:15:25 2015 PDT |
++-------------------------------+------------------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="2.3.0" id="timestamp_cmp">
+
+ <dt>
+ <codeph>timestamp_cmp(timestamp t1, timestamp t2)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">timestamp_cmp() function</indexterm>
+ <b>Purpose:</b> Tests if one <codeph>TIMESTAMP</codeph> value is
+ newer than, older than, or identical to another <codeph>TIMESTAMP</codeph> value.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph> (either -1, 0, 1, or <codeph>NULL</codeph>)
+ </p>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ <b>Usage notes:</b> A comparison function for <codeph>TIMESTAMP</codeph>
+ values that only tests whether the date and time increases, decreases,
+ or stays the same. Similar to the <codeph>sign()</codeph> function
+ for numeric values.
+ </p>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show all the possible return values for <codeph>timestamp_cmp()</codeph>.
+ If the first argument represents a later point in time than the second argument, the result is 1.
+ The amount of the difference is irrelevant, only the fact that one argument is greater than or less than the other.
+ If the first argument represents an earlier point in time than the second argument, the result is -1.
+ If the first and second arguments represent identical points in time, the result is 0.
+ If either argument is <codeph>NULL</codeph>, the result is <codeph>NULL</codeph>.
+ </p>
+<codeblock>/* First argument 'later' than second argument. */
+
+select timestamp_cmp(now() + interval 70 minutes, now());
++---------------------------------------------------+
+| timestamp_cmp(now() + interval 70 minutes, now()) |
++---------------------------------------------------+
+| 1 |
++---------------------------------------------------+
+
+select timestamp_cmp(now() + interval 3 days + interval 5 hours, now());
++------------------------------------------------------------------+
+| timestamp_cmp(now() + interval 3 days + interval 5 hours, now()) |
++------------------------------------------------------------------+
+| 1 |
++------------------------------------------------------------------+
+
+/* First argument 'earlier' than second argument. */
+select timestamp_cmp(now(), now() + interval 2 hours);
++------------------------------------------------+
+| timestamp_cmp(now(), now() + interval 2 hours) |
++------------------------------------------------+
+| -1 |
++------------------------------------------------+
+
+/* Both arguments represent the same point in time. */
+
+select timestamp_cmp(now(), now());
++-----------------------------+
+| timestamp_cmp(now(), now()) |
++-----------------------------+
+| 0 |
++-----------------------------+
+
+select timestamp_cmp(now() + interval 1 hour, now() + interval 60 minutes);
++---------------------------------------------------------------------+
+| timestamp_cmp(now() + interval 1 hour, now() + interval 60 minutes) |
++---------------------------------------------------------------------+
+| 0 |
++---------------------------------------------------------------------+
+
+/* Either argument NULL. */
+
+select timestamp_cmp(now(), null);
++----------------------------+
+| timestamp_cmp(now(), null) |
++----------------------------+
+| NULL |
++----------------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="to_date">
+
+ <dt>
+ <codeph>to_date(timestamp)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">to_date() function</indexterm>
+ <b>Purpose:</b> Returns a string representation of the date field from a timestamp value.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="to_utc_timestamp">
+
+ <dt>
+ <codeph>to_utc_timestamp(timestamp, string timezone)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">to_utc_timestamp() function</indexterm>
+ <b>Purpose:</b> Converts a specified timestamp value in a specified time zone into the corresponding
+ value for the UTC time zone.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ <p>
+ <b>Usage notes:</b> Often used in combination with the <codeph>now()</codeph> function,
+ to translate local date and time values to the UTC time zone for consistent representation
+ on disk. The opposite of the <codeph>from_utc_timestamp()</codeph> function.
+ </p>
+ <p>
+ <b>Examples:</b> See discussion of time zones in <xref href="impala_timestamp.xml#timestamp"/>
+ for information about using this function for conversions between the local time zone and UTC.
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.4.0" id="trunc">
+
+ <dt>
+ <codeph>trunc(timestamp, string unit)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">trunc() function</indexterm>
+ <b>Purpose:</b> Strips off fields from a <codeph>TIMESTAMP</codeph> value.
+ <p>
+ <b>Unit argument:</b> The <codeph>unit</codeph> argument value is case-sensitive. This argument string
+ can be one of:
+<!-- Some but not all of the arguments from http://docs.oracle.com/cd/B19306_01/server.102/b14200/functions230.htm#i1002084 are supported here.
+ Impala doesn't support 2-digit years or ISO-related years or values derived from ISO years.
+-->
+ <ul>
+ <li>
+ <codeph>SYYYY</codeph>, <codeph>YYYY</codeph>, <codeph>YEAR</codeph>, <codeph>SYEAR</codeph>,
+ <codeph>YYY</codeph>, <codeph>YY</codeph>, <codeph>Y</codeph>: Year.
+ </li>
+
+ <li>
+ <codeph>Q</codeph>: Quarter.
+ </li>
+
+ <li>
+ <codeph>MONTH</codeph>, <codeph>MON</codeph>, <codeph>MM</codeph>, <codeph>RM</codeph>: Month.
+ </li>
+
+ <li>
+ <codeph>WW</codeph>, <codeph>W</codeph>: Same day of the week as the first day of the month.
+ </li>
+
+ <li>
+ <codeph>DDD</codeph>, <codeph>DD</codeph>, <codeph>J</codeph>: Day.
+ </li>
+
+ <li>
+ <codeph>DAY</codeph>, <codeph>DY</codeph>, <codeph>D</codeph>: Starting day of the week.
+ (Not necessarily the current day.)
+ </li>
+
+ <li>
+ <codeph>HH</codeph>, <codeph>HH12</codeph>, <codeph>HH24</codeph>: Hour. A
+ <codeph>TIMESTAMP</codeph> value truncated to the hour is always represented in 24-hour
+ notation, even for the <codeph>HH12</codeph> argument string.
+ </li>
+
+ <li>
+ <codeph>MI</codeph>: Minute.
+ </li>
+ </ul>
+ </p>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p>
+ Typically used in <codeph>GROUP BY</codeph> queries to aggregate results from the
+ same hour, day, week, month, quarter, and so on. You can also use this function in an <codeph>INSERT
+ ... SELECT</codeph> into a partitioned table to divide <codeph>TIMESTAMP</codeph> values into the
+ correct partition.
+ </p>
+ <p>
+ Because the return value is a <codeph>TIMESTAMP</codeph>, if you cast the result of
+ <codeph>TRUNC()</codeph> to <codeph>STRING</codeph>, you will often see zeroed-out portions such as
+ <codeph>00:00:00</codeph> in the time field. If you only need the individual units such as hour, day,
+ month, or year, use the <codeph>EXTRACT()</codeph> function instead. If you need the individual units
+ from a truncated <codeph>TIMESTAMP</codeph> value, run the <codeph>TRUNCATE()</codeph> function on the
+ original value, then run <codeph>EXTRACT()</codeph> on the result.
+ </p>
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="unix_timestamp">
+
+ <dt>
+ <codeph>unix_timestamp(), unix_timestamp(string datetime), unix_timestamp(string datetime, string
+ format), unix_timestamp(timestamp datetime)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">unix_timestamp() function</indexterm>
+ <b>Purpose:</b> Returns an integer value representing the current date and time as a delta from the Unix
+ epoch, or converts from a specified date and time value represented as a <codeph>TIMESTAMP</codeph> or
+ <codeph>STRING</codeph>.
+ <p>
+ <b>Return type:</b> <codeph rev="2.2.0">bigint</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+ <p rev="1.3.0">
+ See <codeph>from_unixtime()</codeph> for details about the patterns you can use in
+ the <codeph>format</codeph> string to represent the position of year, month, day, and so on in the
+ <codeph>date</codeph> string. In Impala 1.3 and higher, you have more flexibility to switch the
+ positions of elements and use different separator characters.
+ </p>
+ <p rev="2.2.3">
+ In CDH 5.4.3 and higher, you can include a trailing uppercase <codeph>Z</codeph> qualifier
+ to indicate <q>Zulu</q> time, a synonym for UTC.
+ </p>
+ <p rev="2.3.0">
+ In CDH 5.5.0 and higher, you can include a timezone offset specified as minutes and hours,
+ provided you also specify the details in the format string argument. The offset is specified in the format
+ string as a plus or minus sign followed by <codeph>hh:mm</codeph>, <codeph>hhmm</codeph>, or <codeph>hh</codeph>.
+ The <codeph>hh</codeph> must be lowercase, to distinguish it from the <codeph>HH</codeph> that represents
+ hours in the actual time value. Currently, only numeric timezone offsets are allowed, not symbolic names.
+ </p>
+ <p conref="../shared/impala_common.xml#common/y2k38"/>
+ <p conref="../shared/impala_common.xml#common/datetime_function_chaining"/>
+ <p conref="../shared/impala_common.xml#common/timezone_conversion_caveat"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show different ways of turning the same date and time into an integer value.
+ A format string that Impala recognizes by default is interpreted as a UTC date and time.
+ The trailing <codeph>Z</codeph> is a confirmation that the timezone is UTC.
+ If the date and time string is formatted differently, a second argument specifies
+ the position and units for each of the date and time values.
+ </p>
+ <p>
+ The final two examples show how to specify a timezone offset of Pacific Daylight Time, which is 7 hours earlier than UTC.
+ You can use the numeric offset <codeph>-07:00</codeph> and the equivalent suffix of <codeph>-hh:mm</codeph>
+ in the format string, or specify the mnemonic name for the time zone in a call to <codeph>to_utc_timestamp()</codeph>.
+ This particular date and time expressed in PDT translates to a different number than the same date and time expressed in UTC.
+ </p>
+<codeblock rev="2.3.0">
+-- 3 ways of expressing the same date/time in UTC and converting to an integer.
+
+select unix_timestamp('2015-05-15 12:00:00');
++---------------------------------------+
+| unix_timestamp('2015-05-15 12:00:00') |
++---------------------------------------+
+| 1431691200 |
++---------------------------------------+
+
+select unix_timestamp('2015-05-15 12:00:00Z');
++----------------------------------------+
+| unix_timestamp('2015-05-15 12:00:00z') |
++----------------------------------------+
+| 1431691200 |
++----------------------------------------+
+
+select unix_timestamp('May 15, 2015 12:00:00', 'MMM dd, yyyy HH:mm:ss');
++------------------------------------------------------------------+
+| unix_timestamp('may 15, 2015 12:00:00', 'mmm dd, yyyy hh:mm:ss') |
++------------------------------------------------------------------+
+| 1431691200 |
++------------------------------------------------------------------+
+
+-- 2 ways of expressing the same date and time but in a different timezone.
+-- The resulting integer is different from the previous examples.
+
+select unix_timestamp('2015-05-15 12:00:00-07:00', 'yyyy-MM-dd HH:mm:ss-hh:mm');
++--------------------------------------------------------------------------+
+| unix_timestamp('2015-05-15 12:00:00-07:00', 'yyyy-mm-dd hh:mm:ss-hh:mm') |
++--------------------------------------------------------------------------+
+| 1431716400 |
++--------------------------------------------------------------------------+
+
+select unix_timestamp(to_utc_timestamp('2015-05-15 12:00:00', 'PDT'));
++----------------------------------------------------------------+
+| unix_timestamp(to_utc_timestamp('2015-05-15 12:00:00', 'pdt')) |
++----------------------------------------------------------------+
+| 1431716400 |
++----------------------------------------------------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="weekofyear">
+
+ <dt>
+ <!-- <codeph>weekofyear(string date)</codeph> -->
+ <codeph>weekofyear(timestamp date)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">weekofyear() function</indexterm>
+ <b>Purpose:</b> Returns the corresponding week (1-53) from the date portion of a <codeph>TIMESTAMP</codeph>.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="weeks_add">
+
+ <dt>
+ <codeph>weeks_add(timestamp date, int weeks)</codeph>, <codeph>weeks_add(timestamp date, bigint
+ weeks)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">weeks_add() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time plus some number of weeks.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="weeks_sub">
+
+ <dt>
+ <codeph>weeks_sub(timestamp date, int weeks)</codeph>, <codeph>weeks_sub(timestamp date, bigint
+ weeks)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">weeks_sub() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time minus some number of weeks.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="year">
+
+ <dt>
+ <!-- <codeph>year(string date)</codeph> -->
+ <codeph>year(timestamp date)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">year() function</indexterm>
+ <b>Purpose:</b> Returns the year field from the date portion of a <codeph>TIMESTAMP</codeph>.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="years_add">
+
+ <dt>
+ <codeph>years_add(timestamp date, int years)</codeph>, <codeph>years_add(timestamp date, bigint
+ years)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">years_add() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time plus some number of years.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="years_sub">
+
+ <dt>
+ <codeph>years_sub(timestamp date, int years)</codeph>, <codeph>years_sub(timestamp date, bigint
+ years)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">years_sub() function</indexterm>
+ <b>Purpose:</b> Returns the specified date and time minus some number of years.
+ <p>
+ <b>Return type:</b> <codeph>timestamp</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+ </dl>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_ddl.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_ddl.xml b/docs/topics/impala_ddl.xml
new file mode 100644
index 0000000..8e6a3bd
--- /dev/null
+++ b/docs/topics/impala_ddl.xml
@@ -0,0 +1,150 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="ddl">
+
+ <title>DDL Statements</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Databases"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ DDL refers to <q>Data Definition Language</q>, a subset of SQL statements that change the structure of the
+ database schema in some way, typically by creating, deleting, or modifying schema objects such as databases,
+ tables, and views. Most Impala DDL statements start with the keywords <codeph>CREATE</codeph>,
+ <codeph>DROP</codeph>, or <codeph>ALTER</codeph>.
+ </p>
+
+ <p>
+ The Impala DDL statements are:
+ </p>
+
+ <ul>
+ <li>
+ <xref href="impala_alter_table.xml#alter_table"/>
+ </li>
+
+ <li>
+ <xref href="impala_alter_view.xml#alter_view"/>
+ </li>
+
+ <li>
+ <xref href="impala_compute_stats.xml#compute_stats"/>
+ </li>
+
+ <li>
+ <xref href="impala_create_database.xml#create_database"/>
+ </li>
+
+ <li>
+ <xref href="impala_create_function.xml#create_function"/>
+ </li>
+
+ <li rev="2.0.0">
+ <xref href="impala_create_role.xml#create_role"/>
+ </li>
+
+ <li>
+ <xref href="impala_create_table.xml#create_table"/>
+ </li>
+
+ <li>
+ <xref href="impala_create_view.xml#create_view"/>
+ </li>
+
+ <li>
+ <xref href="impala_drop_database.xml#drop_database"/>
+ </li>
+
+ <li>
+ <xref href="impala_drop_function.xml#drop_function"/>
+ </li>
+
+ <li rev="2.0.0">
+ <xref href="impala_drop_role.xml#drop_role"/>
+ </li>
+
+ <li>
+ <xref href="impala_drop_table.xml#drop_table"/>
+ </li>
+
+ <li>
+ <xref href="impala_drop_view.xml#drop_view"/>
+ </li>
+
+ <li rev="2.0.0">
+ <xref href="impala_grant.xml#grant"/>
+ </li>
+
+ <li rev="2.0.0">
+ <xref href="impala_revoke.xml#revoke"/>
+ </li>
+ </ul>
+
+ <p>
+ After Impala executes a DDL command, information about available tables, columns, views, partitions, and so
+ on is automatically synchronized between all the Impala nodes in a cluster. (Prior to Impala 1.2, you had to
+ issue a <codeph>REFRESH</codeph> or <codeph>INVALIDATE METADATA</codeph> statement manually on the other
+ nodes to make them aware of the changes.)
+ </p>
+
+ <p>
+ If the timing of metadata updates is significant, for example if you use round-robin scheduling where each
+ query could be issued through a different Impala node, you can enable the
+ <xref href="impala_sync_ddl.xml#sync_ddl">SYNC_DDL</xref> query option to make the DDL statement wait until
+ all nodes have been notified about the metadata changes.
+ </p>
+
+ <p rev="2.2.0">
+ See <xref href="impala_s3.xml#s3"/> for details about how Impala DDL statements interact with
+ tables and partitions stored in the Amazon S3 filesystem.
+ </p>
+
+ <p>
+ Although the <codeph>INSERT</codeph> statement is officially classified as a DML (data manipulation language)
+ statement, it also involves metadata changes that must be broadcast to all Impala nodes, and so is also
+ affected by the <codeph>SYNC_DDL</codeph> query option.
+ </p>
+
+ <p>
+ Because the <codeph>SYNC_DDL</codeph> query option makes each DDL operation take longer than normal, you
+ might only enable it before the last DDL operation in a sequence. For example, if you are running a script
+ that issues multiple DDL operations to set up an entire new schema, add several new partitions, and so on,
+ you might minimize the performance overhead by enabling the query option only before the last
+ <codeph>CREATE</codeph>, <codeph>DROP</codeph>, <codeph>ALTER</codeph>, or <codeph>INSERT</codeph> statement.
+ The script only finishes when all the relevant metadata changes are recognized by all the Impala nodes, so
+ you could connect to any node and issue queries through it.
+ </p>
+
+ <p>
+ The classification of DDL, DML, and other statements is not necessarily the same between Impala and Hive.
+ Impala organizes these statements in a way intended to be familiar to people who work with relational
+ databases or data warehouse products. Statements that modify the metastore database, such as <codeph>COMPUTE
+ STATS</codeph>, are classified as DDL. Statements that only query the metastore database, such as
+ <codeph>SHOW</codeph> or <codeph>DESCRIBE</codeph>, are put into a separate category of utility statements.
+ </p>
+
+ <note>
+ The query types shown in the Impala debug web user interface might not match exactly the categories listed
+ here. For example, currently the <codeph>USE</codeph> statement is shown as DDL in the debug web UI. The
+ query types shown in the debug web UI are subject to change, for improved consistency.
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ The other major classifications of SQL statements are data manipulation language (see
+ <xref href="impala_dml.xml#dml"/>) and queries (see <xref href="impala_select.xml#select"/>).
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_debug_action.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_debug_action.xml b/docs/topics/impala_debug_action.xml
new file mode 100644
index 0000000..b931979
--- /dev/null
+++ b/docs/topics/impala_debug_action.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="debug_action">
+
+ <title>DEBUG_ACTION Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">DEBUG_ACTION query option</indexterm>
+ Introduces artificial problem conditions within queries. For internal Cloudera debugging and troubleshooting.
+ </p>
+
+ <p>
+ <b>Type:</b> <codeph>STRING</codeph>
+ </p>
+
+ <p>
+ <b>Default:</b> empty string
+ </p>
+ </conbody>
+</concept>
[22/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
First try at porting over the source files necessary for the Impala SQL
Reference.
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/463ddf92
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/463ddf92
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/463ddf92
Branch: refs/heads/doc_prototype
Commit: 463ddf9243da95f1ab68a4f9b489ef31094e6fc3
Parents: 0ad935b
Author: John Russell <jr...@cloudera.com>
Authored: Tue Jul 26 16:02:54 2016 -0700
Committer: John Russell <jr...@cloudera.com>
Committed: Tue Jul 26 16:02:54 2016 -0700
----------------------------------------------------------------------
docs/impala.ditamap | 252 ++
docs/impala_sqlref.ditamap | 146 +
docs/shared/ImpalaVariables.xml | 46 +
docs/shared/impala_common.xml | 2477 ++++++++++++++++
.../impala_abort_on_default_limit_exceeded.xml | 20 +
docs/topics/impala_abort_on_error.xml | 40 +
docs/topics/impala_aggregate_functions.xml | 33 +
docs/topics/impala_aliases.xml | 71 +
.../topics/impala_allow_unsupported_formats.xml | 29 +
docs/topics/impala_alter_table.xml | 411 +++
docs/topics/impala_alter_view.xml | 73 +
docs/topics/impala_analytic_functions.xml | 1742 +++++++++++
docs/topics/impala_appx_count_distinct.xml | 77 +
docs/topics/impala_appx_median.xml | 122 +
docs/topics/impala_array.xml | 266 ++
docs/topics/impala_avg.xml | 223 ++
docs/topics/impala_batch_size.xml | 33 +
docs/topics/impala_bigint.xml | 100 +
docs/topics/impala_bit_functions.xml | 798 +++++
docs/topics/impala_boolean.xml | 128 +
docs/topics/impala_char.xml | 275 ++
docs/topics/impala_comments.xml | 51 +
docs/topics/impala_complex_types.xml | 2725 ++++++++++++++++++
docs/topics/impala_compression_codec.xml | 95 +
docs/topics/impala_compute_stats.xml | 418 +++
docs/topics/impala_conditional_functions.xml | 443 +++
docs/topics/impala_conversion_functions.xml | 758 +++++
docs/topics/impala_count.xml | 230 ++
docs/topics/impala_create_database.xml | 115 +
docs/topics/impala_create_function.xml | 291 ++
docs/topics/impala_create_role.xml | 66 +
docs/topics/impala_create_table.xml | 650 +++++
docs/topics/impala_create_view.xml | 136 +
docs/topics/impala_databases.xml | 65 +
docs/topics/impala_datatypes.xml | 43 +
docs/topics/impala_datetime_functions.xml | 1505 ++++++++++
docs/topics/impala_ddl.xml | 150 +
docs/topics/impala_debug_action.xml | 28 +
docs/topics/impala_decimal.xml | 836 ++++++
docs/topics/impala_default_order_by_limit.xml | 34 +
docs/topics/impala_delete.xml | 64 +
docs/topics/impala_describe.xml | 561 ++++
docs/topics/impala_disable_codegen.xml | 36 +
docs/topics/impala_disable_unsafe_spills.xml | 48 +
docs/topics/impala_distinct.xml | 59 +
docs/topics/impala_dml.xml | 85 +
docs/topics/impala_double.xml | 100 +
docs/topics/impala_drop_database.xml | 124 +
docs/topics/impala_drop_function.xml | 60 +
docs/topics/impala_drop_role.xml | 67 +
docs/topics/impala_drop_stats.xml | 275 ++
docs/topics/impala_drop_table.xml | 142 +
docs/topics/impala_drop_view.xml | 48 +
.../impala_exec_single_node_rows_threshold.xml | 91 +
docs/topics/impala_explain.xml | 224 ++
docs/topics/impala_explain_level.xml | 338 +++
docs/topics/impala_float.xml | 94 +
docs/topics/impala_functions.xml | 162 ++
docs/topics/impala_functions_overview.xml | 116 +
docs/topics/impala_grant.xml | 117 +
docs/topics/impala_group_by.xml | 137 +
docs/topics/impala_group_concat.xml | 133 +
docs/topics/impala_having.xml | 42 +
docs/topics/impala_hbase_cache_blocks.xml | 34 +
docs/topics/impala_hbase_caching.xml | 39 +
docs/topics/impala_hints.xml | 247 ++
docs/topics/impala_identifiers.xml | 114 +
docs/topics/impala_insert.xml | 676 +++++
docs/topics/impala_int.xml | 95 +
docs/topics/impala_invalidate_metadata.xml | 236 ++
docs/topics/impala_joins.xml | 520 ++++
docs/topics/impala_langref.xml | 179 ++
docs/topics/impala_langref_sql.xml | 35 +
docs/topics/impala_langref_unsupported.xml | 296 ++
docs/topics/impala_limit.xml | 149 +
docs/topics/impala_literals.xml | 384 +++
docs/topics/impala_live_progress.xml | 81 +
docs/topics/impala_live_summary.xml | 207 ++
docs/topics/impala_load_data.xml | 237 ++
docs/topics/impala_map.xml | 264 ++
docs/topics/impala_math_functions.xml | 1336 +++++++++
docs/topics/impala_max.xml | 192 ++
docs/topics/impala_max_errors.xml | 44 +
docs/topics/impala_max_io_buffers.xml | 28 +
docs/topics/impala_max_scan_range_length.xml | 45 +
docs/topics/impala_mem_limit.xml | 208 ++
docs/topics/impala_min.xml | 191 ++
docs/topics/impala_misc_functions.xml | 148 +
docs/topics/impala_ndv.xml | 133 +
docs/topics/impala_num_nodes.xml | 45 +
docs/topics/impala_num_scanner_threads.xml | 32 +
docs/topics/impala_offset.xml | 64 +
docs/topics/impala_operators.xml | 1262 ++++++++
docs/topics/impala_order_by.xml | 316 ++
.../topics/impala_parquet_compression_codec.xml | 25 +
docs/topics/impala_parquet_file_size.xml | 82 +
docs/topics/impala_porting.xml | 622 ++++
docs/topics/impala_query_options.xml | 75 +
docs/topics/impala_query_timeout_s.xml | 51 +
docs/topics/impala_real.xml | 46 +
docs/topics/impala_refresh.xml | 234 ++
docs/topics/impala_request_pool.xml | 45 +
.../impala_reservation_request_timeout.xml | 35 +
docs/topics/impala_revoke.xml | 96 +
docs/topics/impala_schema_objects.xml | 57 +
docs/topics/impala_select.xml | 203 ++
docs/topics/impala_set.xml | 90 +
docs/topics/impala_show.xml | 1263 ++++++++
docs/topics/impala_smallint.xml | 101 +
docs/topics/impala_stddev.xml | 116 +
docs/topics/impala_string.xml | 161 ++
docs/topics/impala_string_functions.xml | 719 +++++
docs/topics/impala_struct.xml | 406 +++
docs/topics/impala_subqueries.xml | 318 ++
docs/topics/impala_sum.xml | 236 ++
docs/topics/impala_support_start_over.xml | 29 +
docs/topics/impala_sync_ddl.xml | 56 +
docs/topics/impala_tables.xml | 258 ++
docs/topics/impala_timestamp.xml | 441 +++
docs/topics/impala_tinyint.xml | 101 +
docs/topics/impala_truncate_table.xml | 151 +
docs/topics/impala_udf.xml | 1759 +++++++++++
docs/topics/impala_union.xml | 150 +
docs/topics/impala_update.xml | 64 +
docs/topics/impala_use.xml | 77 +
docs/topics/impala_v_cpu_cores.xml | 37 +
docs/topics/impala_varchar.xml | 215 ++
docs/topics/impala_variance.xml | 127 +
docs/topics/impala_views.xml | 185 ++
docs/topics/impala_with.xml | 64 +
130 files changed, 35656 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/impala.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala.ditamap b/docs/impala.ditamap
new file mode 100644
index 0000000..f35f84a
--- /dev/null
+++ b/docs/impala.ditamap
@@ -0,0 +1,252 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE map PUBLIC "-//OASIS//DTD DITA Map//EN" "map.dtd">
+<map id="impala">
+ <title>Impala</title>
+ <topicmeta>
+ <prodinfo conref="shared/ImpalaVariables.xml#impala_vars/prodinfo_pmw_tmv_km">
+ <prodname/>
+ <vrmlist>
+ <vrm version="version_dlq_gry_sm"/>
+ </vrmlist>
+ </prodinfo>
+ </topicmeta>
+<!-- Here is the former site of the Release Notes. Experimenting with moving those to the end for better PDF experience. -->
+<!-- See if there's a way to move include the Release Notes here in HTML, but after Installing/Using for PDF. -->
+<!-- Bring the entire contents of the Installing and Using DITA map in here. -->
+<!--
+ <topicref audience="standalone" href="/Content/Installing-and-Using-Impala_xi42980.xml">
+ <mapref href="/Content/Installing-and-Using-Impala_xi42979.ditamap" format="ditamap"/>
+ </topicref>
+ -->
+ <topicref href="topics/impala_intro.xml" audience="standalone"/>
+ <topicref href="topics/impala_concepts.xml">
+ <topicref href="topics/impala_components.xml"/>
+ <topicref href="topics/impala_development.xml"/>
+ <topicref href="topics/impala_hadoop.xml"/>
+ </topicref>
+ <topicref href="topics/impala_planning.xml">
+ <topicref href="topics/impala_prereqs.xml#prereqs"/>
+ <topicref href="topics/impala_cluster_sizing.xml"/>
+ <topicref href="topics/impala_schema_design.xml"/>
+ </topicref>
+ <topicref audience="standalone" href="topics/impala_install.xml#install">
+ <topicref href="topics/impala_cm_installation.xml#cm_installation"/>
+ <topicref href="topics/impala_noncm_installation.xml#noncm_installation"/>
+ </topicref>
+ <topicref audience="standalone" href="topics/impala_config.xml">
+ <topicref href="topics/impala_config_performance.xml"/>
+ <topicref href="topics/impala_odbc.xml"/>
+ <topicref href="topics/impala_jdbc.xml"/>
+ </topicref>
+ <topicref audience="standalone" href="topics/impala_upgrading.xml"/>
+ <topicref audience="standalone" href="topics/impala_processes.xml">
+ <topicref href="topics/impala_config_options.xml"/>
+ </topicref>
+ <topicref href="topics/impala_tutorial.xml"/>
+ <topicref href="topics/impala_admin.xml">
+ <topicref audience="standalone" href="topics/impala_admission.xml"/>
+ <topicref audience="standalone" href="topics/impala_resource_management.xml"/>
+ <topicref href="topics/impala_timeouts.xml"/>
+ <topicref href="topics/impala_proxy.xml"/>
+ <topicref href="topics/impala_disk_space.xml"/>
+ <topicref audience="integrated" href="topics/impala_auditing.xml"/>
+ <topicref audience="integrated" href="topics/impala_lineage.xml"/>
+ </topicref>
+ <topicref audience="standalone" href="topics/impala_security.xml">
+ <topicref href="topics/impala_security_guidelines.xml"/>
+ <topicref href="topics/impala_security_files.xml"/>
+ <topicref href="topics/impala_security_install.xml"/>
+ <topicref href="topics/impala_security_metastore.xml"/>
+ <topicref href="topics/impala_security_webui.xml"/>
+ <topicref href="topics/impala_ssl.xml"/>
+ <topicref href="topics/impala_authorization.xml"/>
+ <topicref href="topics/impala_authentication.xml">
+ <topicref href="topics/impala_kerberos.xml"/>
+ <topicref href="topics/impala_ldap.xml"/>
+ <topicref href="topics/impala_mixed_security.xml"/>
+ <topicref href="topics/impala_delegation.xml"/>
+ </topicref>
+ <topicref href="topics/impala_auditing.xml"/>
+ <topicref href="topics/impala_lineage.xml"/>
+ </topicref>
+ <topicref href="topics/impala_langref.xml">
+ <topicref href="topics/impala_comments.xml"/>
+ <topicref href="topics/impala_datatypes.xml">
+ <topicref href="topics/impala_array.xml"/>
+ <topicref href="topics/impala_bigint.xml"/>
+ <topicref href="topics/impala_boolean.xml"/>
+ <topicref href="topics/impala_char.xml"/>
+ <topicref href="topics/impala_decimal.xml"/>
+ <topicref href="topics/impala_double.xml"/>
+ <topicref href="topics/impala_float.xml"/>
+ <topicref href="topics/impala_int.xml"/>
+ <topicref href="topics/impala_map.xml"/>
+ <topicref href="topics/impala_real.xml"/>
+ <topicref href="topics/impala_smallint.xml"/>
+ <topicref href="topics/impala_string.xml"/>
+ <topicref href="topics/impala_struct.xml"/>
+ <topicref href="topics/impala_timestamp.xml"/>
+ <topicref href="topics/impala_tinyint.xml"/>
+ <topicref href="topics/impala_varchar.xml"/>
+ <topicref href="topics/impala_complex_types.xml"/>
+ </topicref>
+ <topicref href="topics/impala_literals.xml"/>
+ <topicref href="topics/impala_operators.xml"/>
+ <topicref href="topics/impala_schema_objects.xml">
+ <topicref href="topics/impala_aliases.xml"/>
+ <topicref href="topics/impala_databases.xml"/>
+ <topicref href="topics/impala_functions_overview.xml"/>
+ <topicref href="topics/impala_identifiers.xml"/>
+ <topicref href="topics/impala_tables.xml"/>
+ <topicref href="topics/impala_views.xml"/>
+ </topicref>
+ <topicref href="topics/impala_langref_sql.xml">
+ <topicref href="topics/impala_ddl.xml"/>
+ <topicref href="topics/impala_dml.xml"/>
+ <topicref href="topics/impala_alter_table.xml"/>
+ <topicref href="topics/impala_alter_view.xml"/>
+ <topicref href="topics/impala_compute_stats.xml"/>
+ <topicref href="topics/impala_create_database.xml"/>
+ <topicref href="topics/impala_create_function.xml"/>
+ <topicref href="topics/impala_create_role.xml"/>
+ <topicref href="topics/impala_create_table.xml"/>
+ <topicref href="topics/impala_create_view.xml"/>
+ <topicref audience="impala_next" href="topics/impala_delete.xml"/>
+ <topicref href="topics/impala_describe.xml"/>
+ <topicref href="topics/impala_drop_database.xml"/>
+ <topicref href="topics/impala_drop_function.xml"/>
+ <topicref href="topics/impala_drop_role.xml"/>
+ <topicref href="topics/impala_drop_stats.xml"/>
+ <topicref href="topics/impala_drop_table.xml"/>
+ <topicref href="topics/impala_drop_view.xml"/>
+ <topicref href="topics/impala_explain.xml"/>
+ <topicref href="topics/impala_grant.xml"/>
+ <topicref href="topics/impala_insert.xml"/>
+ <topicref href="topics/impala_invalidate_metadata.xml"/>
+ <topicref href="topics/impala_load_data.xml"/>
+ <topicref href="topics/impala_refresh.xml"/>
+ <topicref href="topics/impala_revoke.xml"/>
+ <topicref href="topics/impala_select.xml">
+ <topicref href="topics/impala_joins.xml"/>
+ <topicref href="topics/impala_order_by.xml"/>
+ <topicref href="topics/impala_group_by.xml"/>
+ <topicref href="topics/impala_having.xml"/>
+ <topicref href="topics/impala_limit.xml"/>
+ <topicref href="topics/impala_offset.xml"/>
+ <topicref href="topics/impala_union.xml"/>
+ <topicref href="topics/impala_subqueries.xml"/>
+ <topicref href="topics/impala_with.xml"/>
+ <topicref href="topics/impala_distinct.xml"/>
+ <topicref href="topics/impala_hints.xml"/>
+ </topicref>
+ <topicref href="topics/impala_set.xml"/>
+ <topicref href="topics/impala_query_options.xml">
+ <topicref href="topics/impala_abort_on_default_limit_exceeded.xml"/>
+ <topicref href="topics/impala_abort_on_error.xml"/>
+ <topicref href="topics/impala_allow_unsupported_formats.xml"/>
+ <topicref href="topics/impala_appx_count_distinct.xml"/>
+ <topicref href="topics/impala_batch_size.xml"/>
+ <topicref href="topics/impala_compression_codec.xml"/>
+ <topicref href="topics/impala_debug_action.xml"/>
+ <topicref href="topics/impala_default_order_by_limit.xml"/>
+ <topicref href="topics/impala_disable_codegen.xml"/>
+ <topicref href="topics/impala_disable_unsafe_spills.xml"/>
+ <topicref href="topics/impala_exec_single_node_rows_threshold.xml"/>
+ <topicref href="topics/impala_explain_level.xml"/>
+ <topicref href="topics/impala_hbase_cache_blocks.xml"/>
+ <topicref href="topics/impala_hbase_caching.xml"/>
+ <topicref href="topics/impala_live_progress.xml"/>
+ <topicref href="topics/impala_live_summary.xml"/>
+ <topicref href="topics/impala_max_errors.xml"/>
+ <topicref href="topics/impala_max_io_buffers.xml"/>
+ <topicref href="topics/impala_max_scan_range_length.xml"/>
+ <topicref href="topics/impala_mem_limit.xml"/>
+ <topicref href="topics/impala_num_nodes.xml"/>
+ <topicref href="topics/impala_num_scanner_threads.xml"/>
+ <topicref href="topics/impala_parquet_compression_codec.xml"/>
+ <topicref href="topics/impala_parquet_file_size.xml"/>
+ <topicref href="topics/impala_query_timeout_s.xml"/>
+ <topicref href="topics/impala_request_pool.xml"/>
+ <topicref href="topics/impala_reservation_request_timeout.xml"/>
+ <topicref href="topics/impala_support_start_over.xml"/>
+ <topicref href="topics/impala_sync_ddl.xml"/>
+ <topicref href="topics/impala_v_cpu_cores.xml"/>
+ </topicref>
+ <topicref href="topics/impala_show.xml"/>
+ <topicref href="topics/impala_truncate_table.xml"/>
+ <topicref audience="impala_next" href="topics/impala_update.xml"/>
+ <topicref href="topics/impala_use.xml"/>
+ </topicref>
+ <topicref href="topics/impala_functions.xml">
+ <topicref href="topics/impala_math_functions.xml"/>
+ <topicref href="topics/impala_bit_functions.xml"/>
+ <topicref href="topics/impala_conversion_functions.xml"/>
+ <topicref href="topics/impala_datetime_functions.xml"/>
+ <topicref href="topics/impala_conditional_functions.xml"/>
+ <topicref href="topics/impala_string_functions.xml"/>
+ <topicref href="topics/impala_misc_functions.xml"/>
+ <topicref href="topics/impala_aggregate_functions.xml">
+ <topicref href="topics/impala_appx_median.xml"/>
+ <topicref href="topics/impala_avg.xml"/>
+ <topicref href="topics/impala_count.xml"/>
+ <topicref href="topics/impala_group_concat.xml"/>
+ <topicref href="topics/impala_max.xml"/>
+ <topicref href="topics/impala_min.xml"/>
+ <topicref href="topics/impala_ndv.xml"/>
+ <topicref href="topics/impala_stddev.xml"/>
+ <topicref href="topics/impala_sum.xml"/>
+ <topicref href="topics/impala_variance.xml"/>
+ </topicref>
+ <topicref href="topics/impala_analytic_functions.xml"/>
+ <topicref href="topics/impala_udf.xml"/>
+ </topicref>
+ <topicref href="topics/impala_langref_unsupported.xml"/>
+ <topicref href="topics/impala_porting.xml"/>
+ </topicref>
+ <topicref href="topics/impala_impala_shell.xml">
+ <topicref href="topics/impala_shell_options.xml"/>
+ <topicref href="topics/impala_connecting.xml"/>
+ <topicref href="topics/impala_shell_running_commands.xml"/>
+ <topicref href="topics/impala_shell_commands.xml"/>
+ </topicref>
+ <topicref href="topics/impala_performance.xml">
+ <topicref href="topics/impala_perf_cookbook.xml"/>
+ <topicref href="topics/impala_perf_joins.xml"/>
+ <topicref href="topics/impala_perf_stats.xml"/>
+ <topicref href="topics/impala_perf_benchmarking.xml"/>
+ <topicref href="topics/impala_perf_resources.xml"/>
+ <topicref href="topics/impala_perf_hdfs_caching.xml"/>
+ <topicref href="topics/impala_perf_testing.xml"/>
+ <topicref href="topics/impala_explain_plan.xml"/>
+ <topicref href="topics/impala_perf_skew.xml"/>
+ <topicref audience="Cloudera" href="topics/impala_perf_ddl.xml"/>
+ </topicref>
+ <topicref href="topics/impala_scalability.xml"/>
+ <topicref href="topics/impala_partitioning.xml"/>
+ <topicref href="topics/impala_file_formats.xml">
+ <topicref href="topics/impala_txtfile.xml"/>
+ <topicref href="topics/impala_parquet.xml"/>
+ <topicref href="topics/impala_avro.xml"/>
+ <topicref href="topics/impala_rcfile.xml"/>
+ <topicref href="topics/impala_seqfile.xml"/>
+ </topicref>
+ <topicref audience="impala_next" href="topics/impala_kudu.xml"/>
+ <topicref href="topics/impala_hbase.xml"/>
+ <topicref href="topics/impala_s3.xml"/>
+ <topicref href="topics/impala_isilon.xml"/>
+ <topicref href="topics/impala_logging.xml"/>
+ <topicref href="topics/impala_troubleshooting.xml">
+ <topicref href="topics/impala_webui.xml"/>
+ </topicref>
+ <topicref href="topics/impala_ports.xml"/>
+ <topicref href="topics/impala_reserved_words.xml"/>
+<!-- End of former contents of Installing-and-Using-Impala_xi42979.ditamap. -->
+<!-- Need to make this rg_ topic disappear from the Impala PDF. Put audience="standalone"
+ inside the topic itself? -->
+ <topicref audience="standalone" href="topics/rg_impala_vd.xml"/>
+ <topicref audience="standalone" href="topics/impala_faq.xml"/>
+ <topicref audience="standalone" href="topics/impala_release_notes.xml">
+ <mapref href="Cloudera-Impala-Release-Notes.ditamap" format="ditamap"
+ audience="standalone"/>
+ </topicref>
+</map>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/impala_sqlref.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala_sqlref.ditamap b/docs/impala_sqlref.ditamap
new file mode 100644
index 0000000..1b1c345
--- /dev/null
+++ b/docs/impala_sqlref.ditamap
@@ -0,0 +1,146 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE map PUBLIC "-//OASIS//DTD DITA Map//EN" "map.dtd">
+<map id="impala_sqlref">
+ <title>Impala SQL Reference</title>
+ <topicmeta>
+ <prodinfo conref="shared/ImpalaVariables.xml#impala_vars/prodinfo_pmw_tmv_km">
+ <prodname/>
+ <vrmlist>
+ <vrm version="version_dlq_gry_sm"/>
+ </vrmlist>
+ </prodinfo>
+ </topicmeta>
+ <topicref href="topics/impala_langref.xml"/>
+ <topicref href="topics/impala_comments.xml"/>
+ <topicref href="topics/impala_datatypes.xml">
+ <topicref href="topics/impala_array.xml"/>
+ <topicref href="topics/impala_bigint.xml"/>
+ <topicref href="topics/impala_boolean.xml"/>
+ <topicref href="topics/impala_char.xml"/>
+ <topicref href="topics/impala_decimal.xml"/>
+ <topicref href="topics/impala_double.xml"/>
+ <topicref href="topics/impala_float.xml"/>
+ <topicref href="topics/impala_int.xml"/>
+ <topicref href="topics/impala_map.xml"/>
+ <topicref href="topics/impala_real.xml"/>
+ <topicref href="topics/impala_smallint.xml"/>
+ <topicref href="topics/impala_string.xml"/>
+ <topicref href="topics/impala_struct.xml"/>
+ <topicref href="topics/impala_timestamp.xml"/>
+ <topicref href="topics/impala_tinyint.xml"/>
+ <topicref href="topics/impala_varchar.xml"/>
+ <topicref href="topics/impala_complex_types.xml"/>
+ </topicref>
+ <topicref href="topics/impala_literals.xml"/>
+ <topicref href="topics/impala_operators.xml"/>
+ <topicref href="topics/impala_schema_objects.xml">
+ <topicref href="topics/impala_aliases.xml"/>
+ <topicref href="topics/impala_databases.xml"/>
+ <topicref href="topics/impala_functions_overview.xml"/>
+ <topicref href="topics/impala_identifiers.xml"/>
+ <topicref href="topics/impala_tables.xml"/>
+ <topicref href="topics/impala_views.xml"/>
+ </topicref>
+ <topicref href="topics/impala_langref_sql.xml">
+ <topicref href="topics/impala_ddl.xml"/>
+ <topicref href="topics/impala_dml.xml"/>
+ <topicref href="topics/impala_alter_table.xml"/>
+ <topicref href="topics/impala_alter_view.xml"/>
+ <topicref href="topics/impala_compute_stats.xml"/>
+ <topicref href="topics/impala_create_database.xml"/>
+ <topicref href="topics/impala_create_function.xml"/>
+ <topicref href="topics/impala_create_role.xml"/>
+ <topicref href="topics/impala_create_table.xml"/>
+ <topicref href="topics/impala_create_view.xml"/>
+ <topicref audience="impala_next" href="topics/impala_delete.xml"/>
+ <topicref href="topics/impala_describe.xml"/>
+ <topicref href="topics/impala_drop_database.xml"/>
+ <topicref href="topics/impala_drop_function.xml"/>
+ <topicref href="topics/impala_drop_role.xml"/>
+ <topicref href="topics/impala_drop_stats.xml"/>
+ <topicref href="topics/impala_drop_table.xml"/>
+ <topicref href="topics/impala_drop_view.xml"/>
+ <topicref href="topics/impala_explain.xml"/>
+ <topicref href="topics/impala_grant.xml"/>
+ <topicref href="topics/impala_insert.xml"/>
+ <topicref href="topics/impala_invalidate_metadata.xml"/>
+ <topicref href="topics/impala_load_data.xml"/>
+ <topicref href="topics/impala_refresh.xml"/>
+ <topicref href="topics/impala_revoke.xml"/>
+ <topicref href="topics/impala_select.xml">
+ <topicref href="topics/impala_joins.xml"/>
+ <topicref href="topics/impala_order_by.xml"/>
+ <topicref href="topics/impala_group_by.xml"/>
+ <topicref href="topics/impala_having.xml"/>
+ <topicref href="topics/impala_limit.xml"/>
+ <topicref href="topics/impala_offset.xml"/>
+ <topicref href="topics/impala_union.xml"/>
+ <topicref href="topics/impala_subqueries.xml"/>
+ <topicref href="topics/impala_with.xml"/>
+ <topicref href="topics/impala_distinct.xml"/>
+ <topicref href="topics/impala_hints.xml"/>
+ </topicref>
+ <topicref href="topics/impala_set.xml"/>
+ <topicref href="topics/impala_query_options.xml">
+ <topicref href="topics/impala_abort_on_default_limit_exceeded.xml"/>
+ <topicref href="topics/impala_abort_on_error.xml"/>
+ <topicref href="topics/impala_allow_unsupported_formats.xml"/>
+ <topicref href="topics/impala_appx_count_distinct.xml"/>
+ <topicref href="topics/impala_batch_size.xml"/>
+ <topicref href="topics/impala_compression_codec.xml"/>
+ <topicref href="topics/impala_debug_action.xml"/>
+ <topicref href="topics/impala_default_order_by_limit.xml"/>
+ <topicref href="topics/impala_disable_codegen.xml"/>
+ <topicref href="topics/impala_disable_unsafe_spills.xml"/>
+ <topicref href="topics/impala_exec_single_node_rows_threshold.xml"/>
+ <topicref href="topics/impala_explain_level.xml"/>
+ <topicref href="topics/impala_hbase_cache_blocks.xml"/>
+ <topicref href="topics/impala_hbase_caching.xml"/>
+ <topicref href="topics/impala_live_progress.xml"/>
+ <topicref href="topics/impala_live_summary.xml"/>
+ <topicref href="topics/impala_max_errors.xml"/>
+ <topicref href="topics/impala_max_io_buffers.xml"/>
+ <topicref href="topics/impala_max_scan_range_length.xml"/>
+ <topicref href="topics/impala_mem_limit.xml"/>
+ <topicref href="topics/impala_num_nodes.xml"/>
+ <topicref href="topics/impala_num_scanner_threads.xml"/>
+ <topicref href="topics/impala_parquet_compression_codec.xml"/>
+ <topicref href="topics/impala_parquet_file_size.xml"/>
+ <topicref href="topics/impala_query_timeout_s.xml"/>
+ <topicref href="topics/impala_request_pool.xml"/>
+ <topicref href="topics/impala_reservation_request_timeout.xml"/>
+ <topicref href="topics/impala_support_start_over.xml"/>
+ <topicref href="topics/impala_sync_ddl.xml"/>
+ <topicref href="topics/impala_v_cpu_cores.xml"/>
+ </topicref>
+ <topicref href="topics/impala_show.xml"/>
+ <topicref href="topics/impala_truncate_table.xml"/>
+ <topicref audience="impala_next" href="topics/impala_update.xml"/>
+ <topicref href="topics/impala_use.xml"/>
+ </topicref>
+ <topicref href="topics/impala_functions.xml">
+ <topicref href="topics/impala_math_functions.xml"/>
+ <topicref href="topics/impala_bit_functions.xml"/>
+ <topicref href="topics/impala_conversion_functions.xml"/>
+ <topicref href="topics/impala_datetime_functions.xml"/>
+ <topicref href="topics/impala_conditional_functions.xml"/>
+ <topicref href="topics/impala_string_functions.xml"/>
+ <topicref href="topics/impala_misc_functions.xml"/>
+ <topicref href="topics/impala_aggregate_functions.xml">
+ <topicref href="topics/impala_appx_median.xml"/>
+ <topicref href="topics/impala_avg.xml"/>
+ <topicref href="topics/impala_count.xml"/>
+ <topicref href="topics/impala_group_concat.xml"/>
+ <topicref href="topics/impala_max.xml"/>
+ <topicref href="topics/impala_min.xml"/>
+ <topicref href="topics/impala_ndv.xml"/>
+ <topicref href="topics/impala_stddev.xml"/>
+ <topicref href="topics/impala_sum.xml"/>
+ <topicref href="topics/impala_variance.xml"/>
+ </topicref>
+ <topicref href="topics/impala_analytic_functions.xml"/>
+ <topicref href="topics/impala_udf.xml"/>
+ </topicref>
+ <topicref href="topics/impala_langref_unsupported.xml"/>
+ <topicref href="topics/impala_porting.xml"/>
+</map>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/shared/ImpalaVariables.xml
----------------------------------------------------------------------
diff --git a/docs/shared/ImpalaVariables.xml b/docs/shared/ImpalaVariables.xml
new file mode 100644
index 0000000..226eee9
--- /dev/null
+++ b/docs/shared/ImpalaVariables.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept xmlns:ditaarch="http://dita.oasis-open.org/architecture/2005/" id="impala_vars" ditaarch:DITAArchVersion="1.2" domains="(topic concept) (topic hi-d) (topic ut-d) (topic indexing-d) (topic hazard-d) (topic abbrev-d) (topic pr-d) (topic sw-d) (topic ui-d) " xml:lang="en-US">
+ <title>Cloudera Impala Variables</title>
+ <prolog id="prolog_slg_nmv_km">
+ <metadata id="metadata_ecq_qmv_km">
+ <prodinfo id="prodinfo_pmw_tmv_km">
+ <prodname>Apache Impala (incubating)</prodname>
+ <vrmlist>
+ <vrm version="Impala 2.3.x (separated)" id="vrm_pj3_3hv_impala"/>
+ <vrm version="CDH 5.5.x (separated)" id="vrm_pj3_3hv_cdh"/>
+ </vrmlist>
+ </prodinfo>
+ </metadata>
+ </prolog>
+ <conbody>
+ <p>Release Version Variable - <ph id="ReleaseVersion">Impala 2.3.x / CDH 5.5.x (combined)</ph></p>
+ <p>Substitution variables for denoting features available in release X or higher.
+ The upstream docs can refer to the Impala release number.
+ The docs included with a distro can refer to the distro release number by
+ editing the values here.
+ <ul>
+ <li><ph id="impala26">CDH 5.8</ph></li>
+ <li><ph id="impala25">CDH 5.7</ph></li>
+ <li><ph id="impala24">CDH 5.6</ph></li>
+ <li><ph id="impala23">CDH 5.5</ph></li>
+ <li><ph id="impala22">CDH 5.4</ph></li>
+ <li><ph id="impala21">CDH 5.3</ph></li>
+ <li><ph id="impala20">CDH 5.2</ph></li>
+ <li><ph id="impala14">CDH 5.1</ph></li>
+ <li><ph id="impala13">CDH 5.0</ph></li>
+ </ul>
+ </p>
+ <p>Banner for examples showing shell version - <ph id="ShellBanner">(Shell
+ build version: Impala Shell v2.3.x (<varname>hash</varname>) built on
+ <varname>date</varname>)</ph></p>
+ <p>Banner for examples showing impalad version - <ph id="ImpaladBanner">Server version: impalad version 2.3.x (build
+ x.y.z)</ph></p>
+ <data name="version-message" id="version-message">
+ <foreign>
+ <lines xml:space="preserve">This is the documentation for <data name="version"/>.
+Documentation for other versions is available at <xref href="http://www.cloudera.com/content/support/en/documentation.html" scope="external" format="html">Cloudera Documentation</xref>.</lines>
+ </foreign>
+ </data>
+ </conbody>
+</concept>
[04/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_string_functions.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_string_functions.xml b/docs/topics/impala_string_functions.xml
new file mode 100644
index 0000000..a051ed5
--- /dev/null
+++ b/docs/topics/impala_string_functions.xml
@@ -0,0 +1,719 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="string_functions">
+
+ <title>Impala String Functions</title>
+ <titlealts><navtitle>String Functions</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p rev="2.0.0">
+ String functions are classified as those primarily accepting or returning <codeph>STRING</codeph>,
+ <codeph>VARCHAR</codeph>, or <codeph>CHAR</codeph> data types, for example to measure the length of a string
+ or concatenate two strings together.
+ <ul>
+ <li>
+ All the functions that accept <codeph>STRING</codeph> arguments also accept the <codeph>VARCHAR</codeph>
+ and <codeph>CHAR</codeph> types introduced in Impala 2.0.
+ </li>
+
+ <li>
+ Whenever <codeph>VARCHAR</codeph> or <codeph>CHAR</codeph> values are passed to a function that returns a
+ string value, the return type is normalized to <codeph>STRING</codeph>. For example, a call to
+ <codeph>concat()</codeph> with a mix of <codeph>STRING</codeph>, <codeph>VARCHAR</codeph>, and
+ <codeph>CHAR</codeph> arguments produces a <codeph>STRING</codeph> result.
+ </li>
+ </ul>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ The string functions operate mainly on these data types: <xref href="impala_string.xml#string"/>,
+ <xref href="impala_varchar.xml#varchar"/>, and <xref href="impala_char.xml#char"/>.
+ </p>
+
+ <p>
+ <b>Function reference:</b>
+ </p>
+
+ <p>
+ Impala supports the following string functions:
+ </p>
+
+ <dl>
+ <dlentry id="ascii">
+
+ <dt>
+ <codeph>ascii(string str)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">ascii() function</indexterm>
+ <b>Purpose:</b> Returns the numeric ASCII code of the first character of the argument.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="2.3.0" id="btrim">
+
+ <dt>
+ <codeph>btrim(string a)</codeph>,
+ <codeph>btrim(string a, string chars_to_trim)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">btrim() function</indexterm>
+ <b>Purpose:</b> Removes all instances of one or more characters
+ from the start and end of a <codeph>STRING</codeph> value.
+ By default, removes only spaces.
+ If a non-<codeph>NULL</codeph> optional second argument is specified, the function removes all
+ occurrences of characters in that second argument from the beginning and
+ end of the string.
+ <p><b>Return type:</b> <codeph>string</codeph></p>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples show the default <codeph>btrim()</codeph> behavior,
+ and what changes when you specify the optional second argument.
+ All the examples bracket the output value with <codeph>[ ]</codeph>
+ so that you can see any leading or trailing spaces in the <codeph>btrim()</codeph> result.
+ By default, the function removes any number of both leading and trailing spaces.
+ When the second argument is specified, any number of occurrences of any
+ character in the second argument are removed from the start and end of the
+ input string; in this case, spaces are not removed (unless they are part of the second
+ argument) and any instances of the characters are not removed if they do not come
+ right at the beginning or end of the string.
+ </p>
+<codeblock>-- Remove multiple spaces before and one space after.
+select concat('[',btrim(' hello '),']');
++---------------------------------------+
+| concat('[', btrim(' hello '), ']') |
++---------------------------------------+
+| [hello] |
++---------------------------------------+
+
+-- Remove any instances of x or y or z at beginning or end. Leave spaces alone.
+select concat('[',btrim('xy hello zyzzxx','xyz'),']');
++------------------------------------------------------+
+| concat('[', btrim('xy hello zyzzxx', 'xyz'), ']') |
++------------------------------------------------------+
+| [ hello ] |
++------------------------------------------------------+
+
+-- Remove any instances of x or y or z at beginning or end.
+-- Leave x, y, z alone in the middle of the string.
+select concat('[',btrim('xyhelxyzlozyzzxx','xyz'),']');
++----------------------------------------------------+
+| concat('[', btrim('xyhelxyzlozyzzxx', 'xyz'), ']') |
++----------------------------------------------------+
+| [helxyzlo] |
++----------------------------------------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="char_length">
+
+ <dt>
+ <codeph>char_length(string a), <ph rev="1.3.0" id="character_length">character_length(string a)</ph></codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">char_length() function</indexterm>
+ <indexterm audience="Cloudera">character_length() function</indexterm>
+ <b>Purpose:</b> Returns the length in characters of the argument string. Aliases for the
+ <codeph>length()</codeph> function.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="2.3.0" id="chr">
+
+ <dt>
+ <codeph>chr(int character_code)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">chr() function</indexterm>
+ <b>Purpose:</b> Returns a character specified by a decimal code point value.
+ The interpretation and display of the resulting character depends on your system locale.
+ Because consistent processing of Impala string values is only guaranteed
+ for values within the ASCII range, only use this function for values
+ corresponding to ASCII characters.
+ In particular, parameter values greater than 255 return an empty string.
+ <p><b>Return type:</b> <codeph>string</codeph></p>
+ <p>
+ <b>Usage notes:</b> Can be used as the inverse of the <codeph>ascii()</codeph> function, which
+ converts a character to its numeric ASCII code.
+ </p>
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>SELECT chr(65);
++---------+
+| chr(65) |
++---------+
+| A |
++---------+
+
+SELECT chr(97);
++---------+
+| chr(97) |
++---------+
+| a |
++---------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="concat">
+
+ <dt>
+ <codeph>concat(string a, string b...)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">concat() function</indexterm>
+ <b>Purpose:</b> Returns a single string representing all the argument values joined together.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/concat_blurb"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="concat_ws">
+
+ <dt>
+ <codeph>concat_ws(string sep, string a, string b...)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">concat_ws() function</indexterm>
+ <b>Purpose:</b> Returns a single string representing the second and following argument values joined
+ together, delimited by a specified separator.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/concat_blurb"/>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="find_in_set">
+
+ <dt>
+ <codeph>find_in_set(string str, string strList)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">find_in_set() function</indexterm>
+ <b>Purpose:</b> Returns the position (starting from 1) of the first occurrence of a specified string
+ within a comma-separated string. Returns <codeph>NULL</codeph> if either argument is
+ <codeph>NULL</codeph>, 0 if the search string is not found, or 0 if the search string contains a comma.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.2" id="group_concat">
+
+ <dt>
+ <codeph>group_concat(string s [, string sep])</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">group_concat() function</indexterm>
+ <b>Purpose:</b> Returns a single string representing the argument value concatenated together for each
+ row of the result set. If the optional separator string is specified, the separator is added between each
+ pair of concatenated values.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/concat_blurb"/>
+ <p>
+ By default, returns a single string covering the whole result set. To include other columns or values
+ in the result set, or to produce multiple concatenated strings for subsets of rows, include a
+ <codeph>GROUP BY</codeph> clause in the query.
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.2" id="initcap">
+
+ <dt>
+ <codeph>initcap(string str)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">initcap() function</indexterm>
+ <b>Purpose:</b> Returns the input string with the first letter capitalized.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="instr">
+
+ <dt>
+ <codeph>instr(string str, string substr)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">instr() function</indexterm>
+ <b>Purpose:</b> Returns the position (starting from 1) of the first occurrence of a substring within a
+ longer string.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="length">
+
+ <dt>
+ <codeph>length(string a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">length() function</indexterm>
+ <b>Purpose:</b> Returns the length in characters of the argument string.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="locate">
+
+ <dt>
+ <codeph>locate(string substr, string str[, int pos])</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">locate() function</indexterm>
+ <b>Purpose:</b> Returns the position (starting from 1) of the first occurrence of a substring within a
+ longer string, optionally after a particular position.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="lower">
+
+ <dt>
+ <codeph>lower(string a), <ph id="lcase">lcase(string a)</ph> </codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">lower() function</indexterm>
+ <b>Purpose:</b> Returns the argument string converted to all-lowercase.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="lpad">
+
+ <dt>
+ <codeph>lpad(string str, int len, string pad)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">lpad() function</indexterm>
+ <b>Purpose:</b> Returns a string of a specified length, based on the first argument string. If the
+ specified string is too short, it is padded on the left with a repeating sequence of the characters from
+ the pad string. If the specified string is too long, it is truncated on the right.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="ltrim">
+
+ <dt>
+ <codeph>ltrim(string a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">ltrim() function</indexterm>
+ <b>Purpose:</b> Returns the argument string with any leading spaces removed from the left side.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="parse_url">
+
+ <dt>
+ <codeph>parse_url(string urlString, string partToExtract [, string keyToExtract])</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">parse_url() function</indexterm>
+ <b>Purpose:</b> Returns the portion of a URL corresponding to a specified part. The part argument can be
+ <codeph>'PROTOCOL'</codeph>, <codeph>'HOST'</codeph>, <codeph>'PATH'</codeph>, <codeph>'REF'</codeph>,
+ <codeph>'AUTHORITY'</codeph>, <codeph>'FILE'</codeph>, <codeph>'USERINFO'</codeph>, or
+ <codeph>'QUERY'</codeph>. Uppercase is required for these literal values. When requesting the
+ <codeph>QUERY</codeph> portion of the URL, you can optionally specify a key to retrieve just the
+ associated value from the key-value pairs in the query string.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ <p>
+ <b>Usage notes:</b> This function is important for the traditional Hadoop use case of interpreting web
+ logs. For example, if the web traffic data features raw URLs not divided into separate table columns,
+ you can count visitors to a particular page by extracting the <codeph>'PATH'</codeph> or
+ <codeph>'FILE'</codeph> field, or analyze search terms by extracting the corresponding key from the
+ <codeph>'QUERY'</codeph> field.
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="regexp_extract">
+
+ <dt>
+ <codeph>regexp_extract(string subject, string pattern, int index)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">regexp_extract() function</indexterm>
+ <b>Purpose:</b> Returns the specified () group from a string based on a regular expression pattern. Group
+ 0 refers to the entire extracted string, while group 1, 2, and so on refers to the first, second, and so
+ on <codeph>(...)</codeph> portion.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/regexp_re2"/>
+ <p conref="../shared/impala_common.xml#common/regexp_re2_warning"/>
+ <p conref="../shared/impala_common.xml#common/regexp_escapes"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ This example shows how group 0 matches the full pattern string, including the portion outside any
+ <codeph>()</codeph> group:
+ </p>
+<codeblock>[localhost:21000] > select regexp_extract('abcdef123ghi456jkl','.*?(\\d+)',0);
++------------------------------------------------------+
+| regexp_extract('abcdef123ghi456jkl', '.*?(\\d+)', 0) |
++------------------------------------------------------+
+| abcdef123ghi456 |
++------------------------------------------------------+
+Returned 1 row(s) in 0.11s</codeblock>
+ <p>
+ This example shows how group 1 matches just the contents inside the first <codeph>()</codeph> group in
+ the pattern string:
+ </p>
+<codeblock>[localhost:21000] > select regexp_extract('abcdef123ghi456jkl','.*?(\\d+)',1);
++------------------------------------------------------+
+| regexp_extract('abcdef123ghi456jkl', '.*?(\\d+)', 1) |
++------------------------------------------------------+
+| 456 |
++------------------------------------------------------+
+Returned 1 row(s) in 0.11s</codeblock>
+ <p rev="2.0.0">
+ Unlike in earlier Impala releases, the regular expression library used in Impala 2.0 and later supports
+ the <codeph>.*?</codeph> idiom for non-greedy matches. This example shows how a pattern string starting
+ with <codeph>.*?</codeph> matches the shortest possible portion of the source string, returning the
+ rightmost set of lowercase letters. A pattern string both starting and ending with <codeph>.*?</codeph>
+ finds two potential matches of equal length, and returns the first one found (the leftmost set of
+ lowercase letters).
+ </p>
+<codeblock>[localhost:21000] > select regexp_extract('AbcdBCdefGHI','.*?([[:lower:]]+)',1);
++--------------------------------------------------------+
+| regexp_extract('abcdbcdefghi', '.*?([[:lower:]]+)', 1) |
++--------------------------------------------------------+
+| def |
++--------------------------------------------------------+
+[localhost:21000] > select regexp_extract('AbcdBCdefGHI','.*?([[:lower:]]+).*?',1);
++-----------------------------------------------------------+
+| regexp_extract('abcdbcdefghi', '.*?([[:lower:]]+).*?', 1) |
++-----------------------------------------------------------+
+| bcd |
++-----------------------------------------------------------+
+</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="regexp_replace">
+
+ <dt>
+ <codeph>regexp_replace(string initial, string pattern, string replacement)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">regexp_replace() function</indexterm>
+ <b>Purpose:</b> Returns the initial argument with the regular expression pattern replaced by the final
+ argument string.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ <p conref="../shared/impala_common.xml#common/regexp_re2"/>
+ <p conref="../shared/impala_common.xml#common/regexp_re2_warning"/>
+ <p conref="../shared/impala_common.xml#common/regexp_escapes"/>
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ These examples show how you can replace parts of a string matching a pattern with replacement text,
+ which can include backreferences to any <codeph>()</codeph> groups in the pattern string. The
+ backreference numbers start at 1, and any <codeph>\</codeph> characters must be escaped as
+ <codeph>\\</codeph>.
+ </p>
+ <p>
+ Replace a character pattern with new text:
+ </p>
+<codeblock>[localhost:21000] > select regexp_replace('aaabbbaaa','b+','xyz');
++------------------------------------------+
+| regexp_replace('aaabbbaaa', 'b+', 'xyz') |
++------------------------------------------+
+| aaaxyzaaa |
++------------------------------------------+
+Returned 1 row(s) in 0.11s</codeblock>
+ <p>
+ Replace a character pattern with substitution text that includes the original matching text:
+ </p>
+<codeblock>[localhost:21000] > select regexp_replace('aaabbbaaa','(b+)','&lt;\\1&gt;');
++----------------------------------------------+
+| regexp_replace('aaabbbaaa', '(b+)', '&lt;\\1&gt;') |
++----------------------------------------------+
+| aaa&lt;bbb&gt;aaa |
++----------------------------------------------+
+Returned 1 row(s) in 0.11s</codeblock>
+ <p>
+ Remove all characters that are not digits:
+ </p>
+<codeblock>[localhost:21000] > select regexp_replace('123-456-789','[^[:digit:]]','');
++---------------------------------------------------+
+| regexp_replace('123-456-789', '[^[:digit:]]', '') |
++---------------------------------------------------+
+| 123456789 |
++---------------------------------------------------+
+Returned 1 row(s) in 0.12s</codeblock>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="repeat">
+
+ <dt>
+ <codeph>repeat(string str, int n)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">repeat() function</indexterm>
+ <b>Purpose:</b> Returns the argument string repeated a specified number of times.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="reverse">
+
+ <dt>
+ <codeph>reverse(string a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">reverse() function</indexterm>
+ <b>Purpose:</b> Returns the argument string with characters in reversed order.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="rpad">
+
+ <dt>
+ <codeph>rpad(string str, int len, string pad)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">rpad() function</indexterm>
+ <b>Purpose:</b> Returns a string of a specified length, based on the first argument string. If the
+ specified string is too short, it is padded on the right with a repeating sequence of the characters from
+ the pad string. If the specified string is too long, it is truncated on the right.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="rtrim">
+
+ <dt>
+ <codeph>rtrim(string a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">rtrim() function</indexterm>
+ <b>Purpose:</b> Returns the argument string with any trailing spaces removed from the right side.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="space">
+
+ <dt>
+ <codeph>space(int n)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">space() function</indexterm>
+ <b>Purpose:</b> Returns a concatenated string of the specified number of spaces. Shorthand for
+ <codeph>repeat(' ',<varname>n</varname>)</codeph>.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="strleft">
+
+ <dt>
+ <codeph>strleft(string a, int num_chars)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">strleft() function</indexterm>
+ <b>Purpose:</b> Returns the leftmost characters of the string. Shorthand for a call to
+ <codeph>substr()</codeph> with 2 arguments.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="strright">
+
+ <dt>
+ <codeph>strright(string a, int num_chars)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">strright() function</indexterm>
+ <b>Purpose:</b> Returns the rightmost characters of the string. Shorthand for a call to
+ <codeph>substr()</codeph> with 2 arguments.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="substr">
+
+ <dt>
+ <codeph>substr(string a, int start [, int len]), <ph id="substring">substring(string a, int start [, int
+ len])</ph></codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">substr() function</indexterm>
+ <b>Purpose:</b> Returns the portion of the string starting at a specified point, optionally with a
+ specified maximum length. The characters in the string are indexed starting at 1.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="translate">
+
+ <dt>
+ <codeph>translate(string input, string from, string to)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">translate() function</indexterm>
+ <b>Purpose:</b> Returns the input string with a set of characters replaced by another set of characters.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="trim">
+
+ <dt>
+ <codeph>trim(string a)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">trim() function</indexterm>
+ <b>Purpose:</b> Returns the input string with both leading and trailing spaces removed. The same as
+ passing the string through both <codeph>ltrim()</codeph> and <codeph>rtrim()</codeph>.
+ <p>
+ <b>Usage notes:</b> Often used during data cleansing operations during the ETL cycle, if input values might still have surrounding spaces.
+ For a more general-purpose function that can remove other leading and trailing characters besides spaces, see <codeph>btrim()</codeph>.
+ </p>
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="upper">
+
+ <dt>
+ <codeph>upper(string a), <ph id="ucase">ucase(string a)</ph></codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">upper() function</indexterm>
+ <indexterm audience="Cloudera">ucase() function</indexterm>
+ <b>Purpose:</b> Returns the argument string converted to all-uppercase.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+ </dl>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_struct.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_struct.xml b/docs/topics/impala_struct.xml
new file mode 100644
index 0000000..1e440fc
--- /dev/null
+++ b/docs/topics/impala_struct.xml
@@ -0,0 +1,406 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+
+ <concept id="struct">
+
+ <title>STRUCT Complex Type (CDH 5.5 or higher only)</title>
+
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ A complex data type, representing multiple fields of a single item.
+ Frequently used as the element type of an <codeph>ARRAY</codeph>
+ or the <codeph>VALUE</codeph> part of a <codeph>MAP</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock><varname>column_name</varname> STRUCT &lt; <varname>name</varname> : <varname>type</varname> [COMMENT '<varname>comment_string</varname>'], ... &gt;
+
+type ::= <varname>primitive_type</varname> | <varname>complex_type</varname>
+</codeblock>
+
+ <p>
+ The names and number of fields within the <codeph>STRUCT</codeph> are fixed. Each field can be a different type.
+ A field within a <codeph>STRUCT</codeph> can also be another <codeph>STRUCT</codeph>, or an <codeph>ARRAY</codeph>
+ or a <codeph>MAP</codeph>, allowing you to create nested data structures with a maximum nesting depth of 100.
+ </p>
+
+ <p>
+ A <codeph>STRUCT</codeph> can be the top-level type for a column, or can itself be an item within an <codeph>ARRAY</codeph>
+ or the value part of the key-value pair in a <codeph>MAP</codeph>.
+ </p>
+
+ <p>
+ When a <codeph>STRUCT</codeph> is used as an <codeph>ARRAY</codeph> element or a <codeph>MAP</codeph> value,
+ you use a join clause to bring the <codeph>ARRAY</codeph> or <codeph>MAP</codeph> elements into the result set, and then refer
+ to <codeph><varname>array_name</varname>.ITEM.<varname>field</varname></codeph> or
+ <codeph><varname>map_name</varname>.VALUE.<varname>field</varname></codeph>.
+ In the case of a <codeph>STRUCT</codeph> directly inside an <codeph>ARRAY</codeph> or <codeph>MAP</codeph>,
+ you can omit the <codeph>.ITEM</codeph> and <codeph>.VALUE</codeph> pseudocolumns and refer directly to
+ <codeph><varname>array_name</varname>.<varname>field</varname></codeph> or
+ <codeph><varname>map_name</varname>.<varname>field</varname></codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_combo"/>
+
+ <p>
+ A <codeph>STRUCT</codeph> is similar conceptually to a table row: it contains a fixed number of named fields,
+ each with a predefined type. To combine two related tables, while using complex types to
+ minimize repetition, the typical way to represent that data is as an <codeph>ARRAY</codeph> of <codeph>STRUCT</codeph> elements.
+ </p>
+
+ <p>
+ Because a <codeph>STRUCT</codeph> has a fixed number of named fields, it typically does not make sense
+ to have a <codeph>STRUCT</codeph> as the type of a table column. In such a case, you could just make each field of the
+ <codeph>STRUCT</codeph> into a separate column of the table. The <codeph>STRUCT</codeph> type is most
+ useful as an item of an <codeph>ARRAY</codeph> or the value part of the key-value pair in a <codeph>MAP</codeph>.
+ A nested type column with a <codeph>STRUCT</codeph> at the lowest level lets you associate a variable
+ number of row-like objects with each row of the table.
+ </p>
+
+ <p>
+ The <codeph>STRUCT</codeph> type is straightforward to reference within a query. You do not need to
+ include the <codeph>STRUCT</codeph> column in a join clause or give it a table alias, as is
+ required for the <codeph>ARRAY</codeph> and <codeph>MAP</codeph> types. You refer to the individual
+ fields using dot notation, such as <codeph><varname>struct_column_name</varname>.<varname>field_name</varname></codeph>,
+ without any pseudocolumn such as <codeph>ITEM</codeph> or <codeph>VALUE</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_describe"/>
+
+ <p conref="../shared/impala_common.xml#common/internals_blurb"/>
+
+ <p>
+ Within the Parquet data file, the values for each <codeph>STRUCT</codeph> field are stored adjacent to each other,
+ so that they can be encoded and compressed using all the Parquet techniques for storing sets of similar or
+ repeated values. The adjacency applies even when the <codeph>STRUCT</codeph> values are part of an
+ <codeph>ARRAY</codeph> or <codeph>MAP</codeph>. During a query, Impala avoids unnecessary I/O by reading only the portions
+ of the Parquet data file containing the requested <codeph>STRUCT</codeph> fields.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/added_in_230"/>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <ul conref="../shared/impala_common.xml#common/complex_types_restrictions">
+ <li/>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <note conref="../shared/impala_common.xml#common/complex_type_schema_pointer"/>
+
+ <p>
+ The following example shows a table with various kinds of <codeph>STRUCT</codeph> columns,
+ both at the top level and nested within other complex types.
+ Practice the <codeph>CREATE TABLE</codeph> and query notation for complex type columns
+ using empty tables, until you can visualize a complex data structure and construct corresponding SQL statements reliably.
+ </p>
+
+<codeblock><![CDATA[CREATE TABLE struct_demo
+(
+ id BIGINT,
+ name STRING,
+
+-- A STRUCT as a top-level column. Demonstrates how the table ID column
+-- and the ID field within the STRUCT can coexist without a name conflict.
+ employee_info STRUCT < employer: STRING, id: BIGINT, address: STRING >,
+
+-- A STRUCT as the element type of an ARRAY.
+ places_lived ARRAY < STRUCT <street: STRING, city: STRING, country: STRING >>,
+
+-- A STRUCT as the value portion of the key-value pairs in a MAP.
+ memorable_moments MAP < STRING, STRUCT < year: INT, place: STRING, details: STRING >>,
+
+-- A STRUCT where one of the fields is another STRUCT.
+ current_address STRUCT < street_address: STRUCT <street_number: INT, street_name: STRING, street_type: STRING>, country: STRING, postal_code: STRING >
+)
+STORED AS PARQUET;
+]]>
+</codeblock>
+
+ <p>
+ The following example shows how to examine the structure of a table containing one or more
+ <codeph>STRUCT</codeph> columns by using the <codeph>DESCRIBE</codeph> statement. You can
+ visualize each <codeph>STRUCT</codeph> as its own table, with columns
+ named the same as each field of the <codeph>STRUCT</codeph>.
+ If the <codeph>STRUCT</codeph> is nested inside another complex type, such as <codeph>ARRAY</codeph>,
+ you can extend the qualified name passed to <codeph>DESCRIBE</codeph> until the output
+ shows just the <codeph>STRUCT</codeph> fields.
+ </p>
+
+<codeblock><![CDATA[DESCRIBE struct_demo;
++-------------------+--------------------------+
+| name | type |
++-------------------+--------------------------+
+| id | bigint |
+| name | string |
+| employee_info | struct< |
+| | employer:string, |
+| | id:bigint, |
+| | address:string |
+| | > |
+| places_lived | array<struct< |
+| | street:string, |
+| | city:string, |
+| | country:string |
+| | >> |
+| memorable_moments | map<string,struct< |
+| | year:int, |
+| | place:string, |
+| | details:string |
+| | >> |
+| current_address | struct< |
+| | street_address:struct< |
+| | street_number:int, |
+| | street_name:string, |
+| | street_type:string |
+| | >, |
+| | country:string, |
+| | postal_code:string |
+| | > |
++-------------------+--------------------------+
+
+DESCRIBE struct_demo.employee_info;
++----------+--------+
+| name | type |
++----------+--------+
+| employer | string |
+| id | bigint |
+| address | string |
++----------+--------+
+
+-- Because PLACES_LIVED is a STRUCT inside an ARRAY, the
+-- initial DESCRIBE shows the structure of the ARRAY.
+DESCRIBE struct_demo.places_lived;
++------+------------------+
+| name | type |
++------+------------------+
+| item | struct< |
+| | street:string, |
+| | city:string, |
+| | country:string |
+| | > |
+| pos | bigint |
++------+------------------+
+
+-- Ask for the details of the ITEM field of the ARRAY to see
+-- just the layout of the STRUCT.
+DESCRIBE struct_demo.places_lived.item;
++---------+--------+
+| name | type |
++---------+--------+
+| street | string |
+| city | string |
+| country | string |
++---------+--------+
+
+-- Likewise, MEMORABLE_MOMENTS has a STRUCT inside a MAP,
+-- which requires an extra level of qualified name to see
+-- just the STRUCT part.
+DESCRIBE struct_demo.memorable_moments;
++-------+------------------+
+| name | type |
++-------+------------------+
+| key | string |
+| value | struct< |
+| | year:int, |
+| | place:string, |
+| | details:string |
+| | > |
++-------+------------------+
+
+-- For a MAP, ask to see the VALUE field to see the
+-- corresponding STRUCT fields in a table-like structure.
+DESCRIBE struct_demo.memorable_moments.value;
++---------+--------+
+| name | type |
++---------+--------+
+| year | int |
+| place | string |
+| details | string |
++---------+--------+
+
+-- For a STRUCT inside a STRUCT, we can see the fields of the
+-- outer STRUCT...
+DESCRIBE struct_demo.current_address;
++----------------+-----------------------+
+| name | type |
++----------------+-----------------------+
+| street_address | struct< |
+| | street_number:int, |
+| | street_name:string, |
+| | street_type:string |
+| | > |
+| country | string |
+| postal_code | string |
++----------------+-----------------------+
+
+-- ...and then use a further qualified name to see just the
+-- fields of the inner STRUCT.
+DESCRIBE struct_demo.current_address.street_address;
++---------------+--------+
+| name | type |
++---------------+--------+
+| street_number | int |
+| street_name | string |
+| street_type | string |
++---------------+--------+
+]]>
+</codeblock>
+
+ <p>
+ The following example shows how to examine the structure of a table containing one or more
+ <codeph>STRUCT</codeph> columns by using the <codeph>DESCRIBE</codeph> statement. You can
+ visualize each <codeph>STRUCT</codeph> as its own table, with columns
+ named the same as each field of the <codeph>STRUCT</codeph>.
+ If the <codeph>STRUCT</codeph> is nested inside another complex type, such as <codeph>ARRAY</codeph>,
+ you can extend the qualified name passed to <codeph>DESCRIBE</codeph> until the output
+ shows just the <codeph>STRUCT</codeph> fields.
+ </p>
+
+<!-- To do: See why the most verbose query form gives an error. -->
+<codeblock><![CDATA[DESCRIBE struct_demo;
++-------------------+--------------------------+---------+
+| name | type | comment |
++-------------------+--------------------------+---------+
+| id | bigint | |
+| name | string | |
+| employee_info | struct< | |
+| | employer:string, | |
+| | id:bigint, | |
+| | address:string | |
+| | > | |
+| places_lived | array<struct< | |
+| | street:string, | |
+| | city:string, | |
+| | country:string | |
+| | >> | |
+| memorable_moments | map<string,struct< | |
+| | year:int, | |
+| | place:string, | |
+| | details:string | |
+| | >> | |
+| current_address | struct< | |
+| | street_address:struct< | |
+| | street_number:int, | |
+| | street_name:string, | |
+| | street_type:string | |
+| | >, | |
+| | country:string, | |
+| | postal_code:string | |
+| | > | |
++-------------------+--------------------------+---------+
+
+SELECT id, employee_info.id FROM struct_demo;
+
+SELECT id, employee_info.id AS employee_id FROM struct_demo;
+
+SELECT id, employee_info.id AS employee_id, employee_info.employer
+ FROM struct_demo;
+
+SELECT id, name, street, city, country
+ FROM struct_demo, struct_demo.places_lived;
+
+SELECT id, name, struct_demo.places_lived.pos, struct_demo.places_lived.street, struct_demo.places_lived.city, struct_demo.places_lived.country
+ FROM struct_demo, struct_demo.places_lived;
+ERROR: AnalysisException: Illegal column/field reference 'struct_demo.places_lived.pos' with intermediate collection 'places_lived' of type 'ARRAY<STRUCT<street:STRING,city:STRING,country:STRING>>'
+
+SELECT id, name, pl.pos, pl.street, pl.city, pl.country
+ FROM struct_demo, struct_demo.places_lived AS pl;
+
+SELECT id, name, places_lived.pos, places_lived.street, places_lived.city, places_lived.country
+ FROM struct_demo, struct_demo.places_lived;
+
+SELECT id, name, pos, street, city, country
+ FROM struct_demo, struct_demo.places_lived;
+
+SELECT id, name, struct_demo.memorable_moments.key,
+ struct_demo.memorable_moments.value.year,
+ struct_demo.memorable_moments.value.place,
+ struct_demo.memorable_moments.value.details
+FROM struct_demo, struct_demo.memorable_moments
+WHERE struct_demo.memorable_moments.key IN ('Birthday','Anniversary','Graduation');
+ERROR: AnalysisException: Illegal column/field reference 'struct_demo.memorable_moments.key' with intermediate collection 'memorable_moments' of type 'MAP<STRING,STRUCT<year:INT,place:STRING,details:STRING>>'
+
+SELECT id, name, mm.key, mm.value.year, mm.value.place, mm.value.details
+ FROM struct_demo, struct_demo.memorable_moments AS mm
+WHERE mm.key IN ('Birthday','Anniversary','Graduation');
+
+SELECT id, name, memorable_moments.key, memorable_moments.value.year,
+ memorable_moments.value.place, memorable_moments.value.details
+FROM struct_demo, struct_demo.memorable_moments
+WHERE key IN ('Birthday','Anniversary','Graduation');
+
+SELECT id, name, key, value.year, value.place, value.details
+ FROM struct_demo, struct_demo.memorable_moments
+WHERE key IN ('Birthday','Anniversary','Graduation');
+
+SELECT id, name, key, year, place, details
+ FROM struct_demo, struct_demo.memorable_moments
+WHERE key IN ('Birthday','Anniversary','Graduation');
+
+SELECT id, name,
+ current_address.street_address.street_number,
+ current_address.street_address.street_name,
+ current_address.street_address.street_type,
+ current_address.country,
+ current_address.postal_code
+FROM struct_demo;
+]]>
+</codeblock>
+
+ <p>
+ For example, this table uses a struct that encodes several data values for each phone number associated
+ with a person. Each person can have a variable-length array of associated phone numbers, and queries can
+ refer to the category field to locate specific home, work, mobile, and so on kinds of phone numbers.
+ </p>
+
+<codeblock>CREATE TABLE contact_info_many_structs
+(
+ id BIGINT, name STRING,
+  phone_numbers ARRAY &lt; STRUCT &lt;category:STRING, country_code:STRING, area_code:SMALLINT, full_number:STRING, mobile:BOOLEAN, carrier:STRING &gt; &gt;
+) STORED AS PARQUET;
+</codeblock>
+
+ <p>
+ Because structs are naturally suited to composite values where the fields have different data types, you might use them
+ to decompose things such as addresses:
+ </p>
+
+<codeblock>CREATE TABLE contact_info_detailed_address
+(
+ id BIGINT, name STRING,
+  address STRUCT &lt; house_number:INT, street:STRING, street_type:STRING, apartment:STRING, city:STRING, region:STRING, country:STRING &gt;
+);
+</codeblock>
+
+ <p>
+ In a big data context, splitting out data fields such as the number part of the address and the street name
+ could let you do analysis on each field independently. For example, which streets have the largest number
+ range of addresses, what are the statistical properties of the street names, which areas have a higher
+ proportion of <q>Roads</q>, <q>Courts</q> or <q>Boulevards</q>, and so on.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_complex_types.xml#complex_types"/>,
+ <xref href="impala_array.xml#array"/>,
+ <!-- <xref href="impala_struct.xml#struct"/>, -->
+ <xref href="impala_map.xml#map"/>
+ </p>
+
+ </conbody>
+
+ </concept>
+
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_subqueries.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_subqueries.xml b/docs/topics/impala_subqueries.xml
new file mode 100644
index 0000000..ed99f3a
--- /dev/null
+++ b/docs/topics/impala_subqueries.xml
@@ -0,0 +1,318 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.0.0" id="subqueries">
+
+ <title>Subqueries in Impala SELECT Statements</title>
+ <titlealts><navtitle>Subqueries</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">subqueries</indexterm>
+ A <term>subquery</term> is a query that is nested within another query. Subqueries let queries on one table
+ dynamically adapt based on the contents of another table. This technique provides great flexibility and
+ expressive power for SQL queries.
+ </p>
+
+ <p>
+ A subquery can return a result set for use in the <codeph>FROM</codeph> or <codeph>WITH</codeph> clauses, or
+ with operators such as <codeph>IN</codeph> or <codeph>EXISTS</codeph>.
+ </p>
+
+ <p>
+ A <term>scalar subquery</term> produces a result set with a single row containing a single column, typically
+ produced by an aggregation function such as <codeph>MAX()</codeph> or <codeph>SUM()</codeph>. This single
+ result value can be substituted in scalar contexts such as arguments to comparison operators. If the result
+ set is empty, the value of the scalar subquery is <codeph>NULL</codeph>. For example, the following query
+ finds the maximum value of <codeph>T2.Y</codeph> and then substitutes that value into the
+ <codeph>WHERE</codeph> clause of the outer block that queries <codeph>T1</codeph>:
+ </p>
+
+<codeblock>SELECT x FROM t1 WHERE x > (SELECT MAX(y) FROM t2);
+</codeblock>
+
+ <p>
+ <term>Uncorrelated subqueries</term> do not refer to any tables from the outer block of the query. The same
+ value or set of values produced by the subquery is used when evaluating each row from the outer query block.
+ In this example, the subquery returns an arbitrary number of values from <codeph>T2.Y</codeph>, and each
+ value of <codeph>T1.X</codeph> is tested for membership in that same set of values:
+ </p>
+
+<codeblock>SELECT x FROM t1 WHERE x IN (SELECT y FROM t2);
+</codeblock>
+
+ <p>
+ <term>Correlated subqueries</term> compare one or more values from the outer query block to values referenced
+ in the <codeph>WHERE</codeph> clause of the subquery. Each row evaluated by the outer <codeph>WHERE</codeph>
+ clause can be evaluated using a different set of values. These kinds of subqueries are restricted in the
+ kinds of comparisons they can do between columns of the inner and outer tables. (See the following
+ <b>Restrictions</b> item.)
+ </p>
+
+ <p>
+ For example, the following query finds all the employees with salaries that are higher than average for their
+ department. The subquery potentially computes a different <codeph>AVG()</codeph> value for each employee.
+ </p>
+
+<!-- TK: Construct an EMPLOYEES schema to try out examples like these. -->
+
+<codeblock>SELECT employee_name, employee_id FROM employees one WHERE
+ salary > (SELECT avg(salary) FROM employees two WHERE one.dept_id = two.dept_id);
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ <b>Subquery in the <codeph>FROM</codeph> clause:</b>
+ </p>
+
+<codeblock>SELECT <varname>select_list</varname> FROM <varname>table_ref</varname> [, <varname>table_ref</varname> ...]
+
+<varname>table_ref</varname> ::= <varname>table_name</varname> | (<varname>select_statement</varname>)
+</codeblock>
+
+ <p>
+ <b>Subqueries in <codeph>WHERE</codeph> clause:</b>
+ </p>
+
+<codeblock>WHERE <varname>value</varname> <varname>comparison_operator</varname> (<varname>scalar_select_statement</varname>)
+WHERE <varname>value</varname> [NOT] IN (<varname>select_statement</varname>)
+WHERE [NOT] EXISTS (<varname>correlated_select_statement</varname>)
+WHERE NOT EXISTS (<varname>correlated_select_statement</varname>)
+</codeblock>
+
+ <p>
+ <codeph>comparison_operator</codeph> is a numeric comparison such as <codeph>=</codeph>,
+      <codeph>&lt;=</codeph>, <codeph>!=</codeph>, and so on, or a string comparison operator such as
+ <codeph>LIKE</codeph> or <codeph>REGEXP</codeph>.
+ </p>
+
+ <p rev="2.0.0">
+      Although you can use non-equality comparison operators such as <codeph>&lt;</codeph> or
+ <codeph>>=</codeph>, the subquery must include at least one equality comparison between the columns of the
+ inner and outer query blocks.
+ </p>
+
+ <p>
+ All syntax is available for both correlated and uncorrelated queries, except that the <codeph>NOT
+ EXISTS</codeph> clause cannot be used with an uncorrelated subquery.
+ </p>
+
+ <p>
+ Impala subqueries can be nested arbitrarily deep.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/sql1999"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ This example illustrates how subqueries can be used in the <codeph>FROM</codeph> clause to organize the table
+ names, column names, and column values by producing intermediate result sets, especially for join queries.
+ </p>
+
+<codeblock>SELECT avg(t1.x), max(t2.y) FROM
+ (SELECT id, cast(a AS DECIMAL(10,5)) AS x FROM raw_data WHERE a BETWEEN 0 AND 100) AS t1
+ JOIN
+  (SELECT id, length(s) AS y FROM raw_data WHERE s LIKE 'A%') AS t2
+  USING (id);
+</codeblock>
+
+ <p rev="2.0.0">
+ These examples show how a query can test for the existence of values in a separate table using the
+ <codeph>EXISTS()</codeph> operator with a subquery.
+<!--
+Internally, these queries are processed in a way similar to join queries.
+Because the values from the second table are not part of the result set, the subquery
+is more efficient than the equivalent join query.
+-->
+ </p>
+
+ <p>
+ The following examples show how a value can be compared against a set of values returned by a subquery.
+ </p>
+
+<codeblock rev="2.0.0">SELECT count(x) FROM t1 WHERE EXISTS(SELECT 1 FROM t2 WHERE t1.x = t2.y * 10);
+
+SELECT x FROM t1 WHERE x IN (SELECT y FROM t2 WHERE state = 'CA');
+</codeblock>
+
+ <p>
+ The following examples demonstrate scalar subqueries. When a subquery is known to return a single value, you
+ can substitute it where you would normally put a constant value.
+ </p>
+
+<codeblock>SELECT x FROM t1 WHERE y = (SELECT max(z) FROM t2);
+SELECT x FROM t1 WHERE y > (SELECT count(z) FROM t2);
+</codeblock>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/partitioning_blurb"/> -->
+
+<!--
+<p conref="/Content/impala_common_xi44078.xml#common/hbase_blurb"/>
+<p>
+Currently, the <codeph>IN (<varname>subquery</varname>)</codeph> operator results in a full table scan
+of an HBase table, rather than being translated into a series of single-row lookups.
+Therefore, this is not an efficient construct to use with Impala queries for HBase tables.
+</p>
+-->
+
+<!--
+<p conref="/Content/impala_common_xi44078.xml#common/parquet_blurb"/>
+<p conref="/Content/impala_common_xi44078.xml#common/text_blurb"/>
+<p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/>
+-->
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ If the same table is referenced in both the outer and inner query blocks, construct a table alias in the
+ outer query block and use a fully qualified name to distinguish the inner and outer table references:
+ </p>
+
+<!-- TK: verify the logic of this example. Probably have other similar ones that could be reused here. -->
+
+<codeblock>SELECT * FROM t1 one WHERE id IN (SELECT parent FROM t1 two WHERE one.parent = two.id);
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/internals_blurb"/>
+
+ <p>
+ Internally, subqueries involving <codeph>IN</codeph>, <codeph>NOT IN</codeph>, <codeph>EXISTS</codeph>, or
+ <codeph>NOT EXISTS</codeph> clauses are rewritten into join queries. Depending on the syntax, the subquery
+ might be rewritten to an outer join, semi join, cross join, or anti join.
+ </p>
+
+ <p>
+ A query is processed differently depending on whether the subquery calls any aggregation functions. There are
+ correlated and uncorrelated forms, with and without calls to aggregation functions. Each of these four
+ categories is rewritten differently.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/column_stats_blurb"/>
+
+ <p>
+ Because queries that include correlated and uncorrelated subqueries in the <codeph>WHERE</codeph> clause are
+ written into join queries, to achieve best performance, follow the same guidelines for running the
+ <codeph>COMPUTE STATS</codeph> statement as you do for tables involved in regular join queries. Run the
+      <codeph>COMPUTE STATS</codeph> statement for each of the associated tables after loading or substantially changing
+ the data in that table. See <xref href="impala_perf_stats.xml#perf_stats"/> for details.
+ </p>
+
+ <p>
+ <b>Added in:</b> Subqueries are substantially enhanced starting in Impala 2.0 for CDH 4, and CDH 5.2.0. Now,
+ they can be used in the <codeph>WHERE</codeph> clause, in combination with clauses such as
+ <codeph>EXISTS</codeph> and <codeph>IN</codeph>, rather than just in the <codeph>FROM</codeph> clause.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p>
+ The initial Impala support for nested subqueries addresses the most common use cases. Some restrictions
+ remain:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ Although you can use subqueries in a query involving <codeph>UNION</codeph> or <codeph>UNION ALL</codeph>
+ in Impala 2.1.0 and higher, currently you cannot construct a union of two subqueries (for example, in the
+ argument of an <codeph>IN</codeph> or <codeph>EXISTS</codeph> operator).
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Subqueries returning scalar values cannot be used with the operators <codeph>ANY</codeph> or
+ <codeph>ALL</codeph>. (Impala does not currently have a <codeph>SOME</codeph> operator, but if it did,
+ the same restriction would apply.)
+ </p>
+ </li>
+
+ <li>
+ <p>
+ For the <codeph>EXISTS</codeph> and <codeph>NOT EXISTS</codeph> clauses, any subquery comparing values
+ from the outer query block to another table must use at least one equality comparison, not exclusively
+ other kinds of comparisons such as less than, greater than, <codeph>BETWEEN</codeph>, or
+ <codeph>!=</codeph>.
+ </p>
+ </li>
+
+ <li>
+<!-- TK: think this is no longer true. -->
+ <p>
+ Currently, a scalar subquery cannot be used as the first or second argument to the
+ <codeph>BETWEEN</codeph> operator.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ A subquery cannot be used inside an <codeph>OR</codeph> conjunction. Expressions inside a subquery, for
+ example in the <codeph>WHERE</codeph> clause, can use <codeph>OR</codeph> conjunctions; the restriction
+ only applies to parts of the query <q>above</q> the subquery.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Scalar subqueries are only supported in numeric contexts. You cannot use a scalar subquery as an argument
+ to the <codeph>LIKE</codeph>, <codeph>REGEXP</codeph>, or <codeph>RLIKE</codeph> operators, or compare it
+ to a value of a non-numeric type such as <codeph>TIMESTAMP</codeph> or <codeph>BOOLEAN</codeph>.
+ </p>
+ </li>
+
+ <li>
+ <p>
+<!-- A subquery cannot be used to generate a scalar value for a function call. -->
+ You cannot use subqueries with the <codeph>CASE</codeph> function to generate the comparison value, the
+ values to be compared against, or the return value.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ A subquery is not allowed in the filter condition for the <codeph>HAVING</codeph> clause. (Strictly
+ speaking, a subquery cannot appear anywhere outside the <codeph>WITH</codeph>, <codeph>FROM</codeph>, and
+ <codeph>WHERE</codeph> clauses.)
+ </p>
+ </li>
+
+ <li>
+ <p>
+ You must use a fully qualified name
+ (<codeph><varname>table_name</varname>.<varname>column_name</varname></codeph> or
+ <codeph><varname>database_name</varname>.<varname>table_name</varname>.<varname>column_name</varname></codeph>)
+ when referring to any column from the outer query block within a subquery.
+ </p>
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p rev="2.3.0">
+ For the complex types (<codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, and <codeph>MAP</codeph>)
+ available in CDH 5.5 / Impala 2.3 and higher, the join queries that <q>unpack</q> complex type
+ columns often use correlated subqueries in the <codeph>FROM</codeph> clause.
+ For example, if the first table in the join clause is <codeph>CUSTOMER</codeph>, the second
+ join clause might have a subquery that selects from the column <codeph>CUSTOMER.C_ORDERS</codeph>,
+ which is an <codeph>ARRAY</codeph>. The subquery re-evaluates the <codeph>ARRAY</codeph> elements
+ corresponding to each row from the <codeph>CUSTOMER</codeph> table.
+ See <xref href="impala_complex_types.xml#complex_types"/> for details and examples of
+ using subqueries with complex types.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_operators.xml#exists"/>, <xref href="impala_operators.xml#in"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_sum.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_sum.xml b/docs/topics/impala_sum.xml
new file mode 100644
index 0000000..6d25f1c
--- /dev/null
+++ b/docs/topics/impala_sum.xml
@@ -0,0 +1,236 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="sum">
+
+ <title>SUM Function</title>
+ <titlealts><navtitle>SUM</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="Analytic Functions"/>
+ <data name="Category" value="Aggregate Functions"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">sum() function</indexterm>
+      An aggregate function that returns the sum of a set of numbers. Its single argument can be a numeric column, or
+      the numeric result of a function or expression applied to the column value. Rows with a <codeph>NULL</codeph>
+      value for the specified column are ignored. If the table is empty, or all the values supplied to
+      <codeph>SUM</codeph> are <codeph>NULL</codeph>, <codeph>SUM</codeph> returns <codeph>NULL</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>SUM([DISTINCT | ALL] <varname>expression</varname>) [OVER (<varname>analytic_clause</varname>)]</codeblock>
+
+ <p>
+ When the query contains a <codeph>GROUP BY</codeph> clause, returns one value for each combination of
+ grouping values.
+ </p>
+
+ <p>
+ <b>Return type:</b> <codeph>BIGINT</codeph> for integer arguments, <codeph>DOUBLE</codeph> for floating-point
+ arguments
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_explanation"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_example"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example shows how to use <codeph>SUM()</codeph> to compute the total for all the values in the
+ table, a subset of values, or the sum for each combination of values in the <codeph>GROUP BY</codeph> clause:
+ </p>
+
+<codeblock>-- Total all the values for this column in the table.
+select sum(c1) from t1;
+-- Find the total for this column from a subset of the table.
+select sum(c1) from t1 where month = 'January' and year = '2013';
+-- Find the total from a set of numeric function results.
+select sum(length(s)) from t1;
+-- Often used with functions that return predefined values to compute a score.
+select sum(case when grade = 'A' then 1.0 when grade = 'B' then 0.75 else 0) as class_honors from test_scores;
+-- Can also be used in combination with DISTINCT and/or GROUP BY.
+-- Return more than one result.
+select month, year, sum(purchase_price) from store_stats group by month, year;
+-- Filter the input to eliminate duplicates before performing the calculation.
+select sum(distinct x) from t1;
+</codeblock>
+
+ <p rev="2.0.0">
+ The following examples show how to use <codeph>SUM()</codeph> in an analytic context. They use a table
+ containing integers from 1 to 10. Notice how the <codeph>SUM()</codeph> is reported for each input value, as
+ opposed to the <codeph>GROUP BY</codeph> clause which condenses the result set.
+<codeblock>select x, property, sum(x) <b>over (partition by property)</b> as sum from int_t where property in ('odd','even');
++----+----------+-----+
+| x | property | sum |
++----+----------+-----+
+| 2 | even | 30 |
+| 4 | even | 30 |
+| 6 | even | 30 |
+| 8 | even | 30 |
+| 10 | even | 30 |
+| 1 | odd | 25 |
+| 3 | odd | 25 |
+| 5 | odd | 25 |
+| 7 | odd | 25 |
+| 9 | odd | 25 |
++----+----------+-----+
+</codeblock>
+
+Adding an <codeph>ORDER BY</codeph> clause lets you experiment with results that are cumulative or apply to a moving
+set of rows (the <q>window</q>). The following examples use <codeph>SUM()</codeph> in an analytic context
+(that is, with an <codeph>OVER()</codeph> clause) to produce a running total of all the even values,
+then a running total of all the odd values. The basic <codeph>ORDER BY x</codeph> clause implicitly
+activates a window clause of <codeph>RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>,
+which is effectively the same as <codeph>ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>,
+therefore all of these examples produce the same results:
+<codeblock>select x, property,
+ sum(x) over (partition by property <b>order by x</b>) as 'cumulative total'
+ from int_t where property in ('odd','even');
++----+----------+------------------+
+| x | property | cumulative total |
++----+----------+------------------+
+| 2 | even | 2 |
+| 4 | even | 6 |
+| 6 | even | 12 |
+| 8 | even | 20 |
+| 10 | even | 30 |
+| 1 | odd | 1 |
+| 3 | odd | 4 |
+| 5 | odd | 9 |
+| 7 | odd | 16 |
+| 9 | odd | 25 |
++----+----------+------------------+
+
+select x, property,
+ sum(x) over
+ (
+ partition by property
+ <b>order by x</b>
+ <b>range between unbounded preceding and current row</b>
+ ) as 'cumulative total'
+from int_t where property in ('odd','even');
++----+----------+------------------+
+| x | property | cumulative total |
++----+----------+------------------+
+| 2 | even | 2 |
+| 4 | even | 6 |
+| 6 | even | 12 |
+| 8 | even | 20 |
+| 10 | even | 30 |
+| 1 | odd | 1 |
+| 3 | odd | 4 |
+| 5 | odd | 9 |
+| 7 | odd | 16 |
+| 9 | odd | 25 |
++----+----------+------------------+
+
+select x, property,
+ sum(x) over
+ (
+ partition by property
+ <b>order by x</b>
+ <b>rows between unbounded preceding and current row</b>
+ ) as 'cumulative total'
+ from int_t where property in ('odd','even');
++----+----------+------------------+
+| x | property | cumulative total |
++----+----------+------------------+
+| 2 | even | 2 |
+| 4 | even | 6 |
+| 6 | even | 12 |
+| 8 | even | 20 |
+| 10 | even | 30 |
+| 1 | odd | 1 |
+| 3 | odd | 4 |
+| 5 | odd | 9 |
+| 7 | odd | 16 |
+| 9 | odd | 25 |
++----+----------+------------------+
+</codeblock>
+
+Changing the direction of the <codeph>ORDER BY</codeph> clause causes the intermediate
+results of the cumulative total to be calculated in a different order:
+
+<codeblock>select x, property, sum(x) over (partition by property <b>order by x desc</b>) as 'cumulative total'
+ from int_t where property in ('odd','even');
++----+----------+------------------+
+| x | property | cumulative total |
++----+----------+------------------+
+| 10 | even | 10 |
+| 8 | even | 18 |
+| 6 | even | 24 |
+| 4 | even | 28 |
+| 2 | even | 30 |
+| 9 | odd | 9 |
+| 7 | odd | 16 |
+| 5 | odd | 21 |
+| 3 | odd | 24 |
+| 1 | odd | 25 |
++----+----------+------------------+
+</codeblock>
+
+The following examples show how to construct a moving window, with a running total taking into account 1 row before
+and 1 row after the current row, within the same partition (all the even values or all the odd values).
+Because of a restriction in the Impala <codeph>RANGE</codeph> syntax, this type of
+moving window is possible with the <codeph>ROWS BETWEEN</codeph> clause but not the <codeph>RANGE BETWEEN</codeph>
+clause:
+<codeblock>select x, property,
+ sum(x) over
+ (
+ partition by property
+ <b>order by x</b>
+ <b>rows between 1 preceding and 1 following</b>
+ ) as 'moving total'
+ from int_t where property in ('odd','even');
++----+----------+--------------+
+| x | property | moving total |
++----+----------+--------------+
+| 2 | even | 6 |
+| 4 | even | 12 |
+| 6 | even | 18 |
+| 8 | even | 24 |
+| 10 | even | 18 |
+| 1 | odd | 4 |
+| 3 | odd | 9 |
+| 5 | odd | 15 |
+| 7 | odd | 21 |
+| 9 | odd | 16 |
++----+----------+--------------+
+
+-- Doesn't work because of syntax restriction on RANGE clause.
+select x, property,
+ sum(x) over
+ (
+ partition by property
+ <b>order by x</b>
+ <b>range between 1 preceding and 1 following</b>
+ ) as 'moving total'
+from int_t where property in ('odd','even');
+ERROR: AnalysisException: RANGE is only supported with both the lower and upper bounds UNBOUNDED or one UNBOUNDED and the other CURRENT ROW.
+</codeblock>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_analytic_functions.xml#analytic_functions"/>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+<!-- This conref appears under SUM(), AVG(), FLOAT, and DOUBLE topics. -->
+
+ <p conref="../shared/impala_common.xml#common/sum_double"/>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_support_start_over.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_support_start_over.xml b/docs/topics/impala_support_start_over.xml
new file mode 100644
index 0000000..2c17b5d
--- /dev/null
+++ b/docs/topics/impala_support_start_over.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="support_start_over">
+
+ <title>SUPPORT_START_OVER Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">SUPPORT_START_OVER query option</indexterm>
+ Leave this setting at its default value.
+ It is a read-only setting, tested by some client applications such as Hue.
+ </p>
+ <p>
+ If you accidentally change it through <cmdname>impala-shell</cmdname>,
+ subsequent queries encounter errors until you undo the change
+ by issuing <codeph>UNSET support_start_over</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/default_false"/>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_sync_ddl.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_sync_ddl.xml b/docs/topics/impala_sync_ddl.xml
new file mode 100644
index 0000000..b217f67
--- /dev/null
+++ b/docs/topics/impala_sync_ddl.xml
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.2.1" id="sync_ddl">
+
+ <title>SYNC_DDL Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="SQL"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">SYNC_DDL query option</indexterm>
+ When enabled, causes any DDL operation such as <codeph>CREATE TABLE</codeph> or <codeph>ALTER TABLE</codeph>
+ to return only when the changes have been propagated to all other Impala nodes in the cluster by the Impala
+ catalog service. That way, if you issue a subsequent <codeph>CONNECT</codeph> statement in
+      <cmdname>impala-shell</cmdname> to connect to a different node in the cluster, you can be sure that the other
+      node will already recognize any added or changed tables. (The catalog service automatically broadcasts the
+      DDL changes to all nodes, but without this option there could be a period of inconsistency if
+ you quickly switched to another node, such as by issuing a subsequent query through a load-balancing proxy.)
+ </p>
+
+ <p>
+ Although <codeph>INSERT</codeph> is classified as a DML statement, when the <codeph>SYNC_DDL</codeph> option
+ is enabled, <codeph>INSERT</codeph> statements also delay their completion until all the underlying data and
+ metadata changes are propagated to all Impala nodes. Internally, Impala inserts have similarities with DDL
+ statements in traditional database systems, because they create metadata needed to track HDFS block locations
+ for new files and they potentially add new partitions to partitioned tables.
+ </p>
+
+ <note>
+ Because this option can introduce a delay after each write operation, if you are running a sequence of
+ <codeph>CREATE DATABASE</codeph>, <codeph>CREATE TABLE</codeph>, <codeph>ALTER TABLE</codeph>,
+ <codeph>INSERT</codeph>, and similar statements within a setup script, to minimize the overall delay you can
+ enable the <codeph>SYNC_DDL</codeph> query option only near the end, before the final DDL statement.
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/default_false_0"/>
+
+ <draft-comment translate="no">
+Example could be useful here.
+</draft-comment>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref href="impala_ddl.xml#ddl"/>
+ </p>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_tables.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_tables.xml b/docs/topics/impala_tables.xml
new file mode 100644
index 0000000..30e3737
--- /dev/null
+++ b/docs/topics/impala_tables.xml
@@ -0,0 +1,258 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="tables">
+
+ <title>Overview of Impala Tables</title>
+ <titlealts><navtitle>Tables</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Databases"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p/>
+
+ <p>
+ Tables are the primary containers for data in Impala. They have the familiar row and column layout similar to
+ other database systems, plus some features such as partitioning often associated with higher-end data
+ warehouse systems.
+ </p>
+
+ <p>
+ Logically, each table has a structure based on the definition of its columns, partitions, and other
+ properties.
+ </p>
+
+ <p>
+ Physically, each table that uses HDFS storage is associated with a directory in HDFS. The table data consists of all the data files
+ underneath that directory:
+ </p>
+
+ <ul>
+ <li>
+ <xref href="impala_tables.xml#internal_tables">Internal tables</xref> are managed by Impala, and use directories
+ inside the designated Impala work area.
+ </li>
+
+ <li>
+ <xref href="impala_tables.xml#external_tables">External tables</xref> use arbitrary HDFS directories, where
+ the data files are typically shared between different Hadoop components.
+ </li>
+
+ <li>
+ Large-scale data is usually handled by partitioned tables, where the data files are divided among different
+ HDFS subdirectories.
+ </li>
+ </ul>
+
+ <p rev="2.2.0">
+ Impala tables can also represent data that is stored in HBase, or in the Amazon S3 filesystem (CDH 5.4.0 or higher),
+ or on Isilon storage devices (CDH 5.4.3 or higher). See <xref href="impala_hbase.xml#impala_hbase"/>,
+ <xref href="impala_s3.xml#s3"/>, and <xref href="impala_isilon.xml#impala_isilon"/>
+ for details about those special kinds of tables.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/ignore_file_extensions"/>
+
+ <p>
+ <b>Related statements:</b> <xref href="impala_create_table.xml#create_table"/>,
+      <xref href="impala_drop_table.xml#drop_table"/>, <xref href="impala_alter_table.xml#alter_table"/>,
+ <xref href="impala_insert.xml#insert"/>, <xref href="impala_load_data.xml#load_data"/>,
+ <xref href="impala_select.xml#select"/>
+ </p>
+ </conbody>
+
+ <concept id="internal_tables">
+
+ <title>Internal Tables</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">internal tables</indexterm>
+ The default kind of table produced by the <codeph>CREATE TABLE</codeph> statement is known as an internal
+ table. (Its counterpart is the external table, produced by the <codeph>CREATE EXTERNAL TABLE</codeph>
+ syntax.)
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ Impala creates a directory in HDFS to hold the data files.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ You can create data in internal tables by issuing <codeph>INSERT</codeph> or <codeph>LOAD DATA</codeph>
+ statements.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If you add or replace data using HDFS operations, issue the <codeph>REFRESH</codeph> command in
+ <cmdname>impala-shell</cmdname> so that Impala recognizes the changes in data files, block locations,
+ and so on.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ When you issue a <codeph>DROP TABLE</codeph> statement, Impala physically removes all the data files
+ from the directory.
+ </p>
+ </li>
+
+ <li>
+ <p conref="../shared/impala_common.xml#common/check_internal_external_table"/>
+ </li>
+
+ <li>
+ <p>
+ When you issue an <codeph>ALTER TABLE</codeph> statement to rename an internal table, all data files
+ are moved into the new HDFS directory for the table. The files are moved even if they were formerly in
+ a directory outside the Impala data directory, for example in an internal table with a
+ <codeph>LOCATION</codeph> attribute pointing to an outside HDFS directory.
+ </p>
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/switch_internal_external_table"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_tables.xml#external_tables"/>, <xref href="impala_create_table.xml#create_table"/>,
+ <xref href="impala_drop_table.xml#drop_table"/>, <xref href="impala_alter_table.xml#alter_table"/>,
+ <xref href="impala_describe.xml#describe"/>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="external_tables">
+
+ <title>External Tables</title>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">external tables</indexterm>
+ The syntax <codeph>CREATE EXTERNAL TABLE</codeph> sets up an Impala table that points at existing data
+        files, potentially in HDFS locations outside the normal Impala data directories. This operation saves the
+ expense of importing the data into a new table when you already have the data files in a known location in
+ HDFS, in the desired file format.
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ You can use Impala to query the data in this table.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ You can create data in external tables by issuing <codeph>INSERT</codeph> or <codeph>LOAD DATA</codeph>
+ statements.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If you add or replace data using HDFS operations, issue the <codeph>REFRESH</codeph> command in
+ <cmdname>impala-shell</cmdname> so that Impala recognizes the changes in data files, block locations,
+ and so on.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ When you issue a <codeph>DROP TABLE</codeph> statement in Impala, that removes the connection that
+ Impala has with the associated data files, but does not physically remove the underlying data. You can
+ continue to use the data files with other Hadoop components and HDFS operations.
+ </p>
+ </li>
+
+ <li>
+ <p conref="../shared/impala_common.xml#common/check_internal_external_table"/>
+ </li>
+
+ <li>
+ <p>
+ When you issue an <codeph>ALTER TABLE</codeph> statement to rename an external table, all data files
+ are left in their original locations.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ You can point multiple external tables at the same HDFS directory by using the same
+ <codeph>LOCATION</codeph> attribute for each one. The tables could have different column definitions,
+ as long as the number and types of columns are compatible with the schema evolution considerations for
+ the underlying file type. For example, for text data files, one table might define a certain column as
+ a <codeph>STRING</codeph> while another defines the same column as a <codeph>BIGINT</codeph>.
+ </p>
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/switch_internal_external_table"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_tables.xml#internal_tables"/>, <xref href="impala_create_table.xml#create_table"/>,
+ <xref href="impala_drop_table.xml#drop_table"/>, <xref href="impala_alter_table.xml#alter_table"/>,
+ <xref href="impala_describe.xml#describe"/>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="table_file_formats">
+ <title>File Formats</title>
+ <conbody>
+ <p>
+ Each table has an associated file format, which determines how Impala interprets the
+ associated data files. See <xref href="impala_file_formats.xml#file_formats"/> for details.
+ </p>
+ <p>
+ You set the file format during the <codeph>CREATE TABLE</codeph> statement,
+ or change it later using the <codeph>ALTER TABLE</codeph> statement.
+ Partitioned tables can have a different file format for individual partitions,
+ allowing you to change the file format used in your ETL process for new data
+ without going back and reconverting all the existing data in the same table.
+ </p>
+ <p>
+ Any <codeph>INSERT</codeph> statements produce new data files with the current file format of the table.
+ For existing data files, changing the file format of the table does not automatically do any data conversion.
+ You must use <codeph>TRUNCATE TABLE</codeph> or <codeph>INSERT OVERWRITE</codeph> to remove any previous data
+ files that use the old file format.
+ Then you use the <codeph>LOAD DATA</codeph> statement, <codeph>INSERT ... SELECT</codeph>, or other mechanism
+ to put data files of the correct format into the table.
+ </p>
+ <p>
+ The default file format, text, is the most flexible and easy to produce when you are just getting started with
+ Impala. The Parquet file format offers the highest query performance and uses compression to reduce storage
+ requirements; therefore, Cloudera recommends using Parquet for Impala tables with substantial amounts of data.
+ <ph rev="2.3.0">Also, the complex types (<codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, and <codeph>MAP</codeph>)
+ available in CDH 5.5 / Impala 2.3 and higher are currently only supported with the Parquet file type.</ph>
+ Based on your existing ETL workflow, you might use other file formats such as Avro, possibly doing a final
+ conversion step to Parquet to take advantage of its performance for analytic queries.
+ </p>
+ </conbody>
+ </concept>
+
+</concept>
[19/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_analytic_functions.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_analytic_functions.xml b/docs/topics/impala_analytic_functions.xml
new file mode 100644
index 0000000..293a512
--- /dev/null
+++ b/docs/topics/impala_analytic_functions.xml
@@ -0,0 +1,1742 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.0.0" id="analytic_functions">
+
+ <title>Impala Analytic Functions</title>
+
+ <titlealts>
+
+ <navtitle>Analytic Functions</navtitle>
+
+ </titlealts>
+
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="Aggregate Functions"/>
+ <data name="Category" value="Analytic Functions"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">analytic functions</indexterm>
+
+ <indexterm audience="Cloudera">window functions</indexterm>
+ Analytic functions (also known as window functions) are a special category of built-in functions. Like
+ aggregate functions, they examine the contents of multiple input rows to compute each output value. However,
+ rather than being limited to one result value per <codeph>GROUP BY</codeph> group, they operate on
+ <term>windows</term> where the input rows are ordered and grouped using flexible conditions expressed through
+ an <codeph>OVER()</codeph> clause.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+<!--
+ <p>
+ Analytic functions produce one output value for each input row, like scalar functions such as
+ <codeph>length()</codeph> or
+ <codeph>substr()</codeph>.
+ </p>
+-->
+
+ <p>
+ Some functions, such as <codeph>LAG()</codeph> and <codeph>RANK()</codeph>, can only be used in this analytic
+ context. Some aggregate functions do double duty: when you call the aggregation functions such as
+ <codeph>MAX()</codeph>, <codeph>SUM()</codeph>, <codeph>AVG()</codeph>, and so on with an
+ <codeph>OVER()</codeph> clause, they produce an output value for each row, based on computations across other
+ rows in the window.
+ </p>
+
+ <p>
+ Although analytic functions often compute the same value you would see from an aggregate function in a
+ <codeph>GROUP BY</codeph> query, the analytic functions produce a value for each row in the result set rather
+ than a single value for each group. This flexibility lets you include additional columns in the
+ <codeph>SELECT</codeph> list, offering more opportunities for organizing and filtering the result set.
+ </p>
+
+ <p>
+ Analytic function calls are only allowed in the <codeph>SELECT</codeph> list and in the outermost
+ <codeph>ORDER BY</codeph> clause of the query. During query processing, analytic functions are evaluated
+ after other query stages such as joins, <codeph>WHERE</codeph>, and <codeph>GROUP BY</codeph>,
+ </p>
+
+<!-- Oracle doesn't show examples until it gets to the actual functions, so let's follow that lead.
+ <p>
+ The following example shows a very simple call to <codeph>MAX()</codeph> in
+ an analytic context, and a similar query using a <codeph>GROUP BY</codeph> clause.
+ </p>
+-->
+
+<!--
+This basic query could be represented either as an analytic
+function call or an aggregation function call in a <codeph>GROUP BY</codeph> query.
+For more elaborate kinds of computations, the flexibility of the analytic window
+makes that the preferred option.
+-->
+
+<!-- TK: construct sample data and fill in query results. -->
+
+<!-- Other DB docs don't necessarily include examples up at this level, only for the individual functions.
+ So maybe take these placeholders out entirely.
+
+<codeblock>SELECT year, month, max(degrees_c) OVER (PARTITION BY year) FROM historical_temps;
+SELECT year, month, max(degrees_c) FROM historical_temps GROUP BY year;
+</codeblock>
+-->
+
+ <p>
+ The rows that are part of each partition are analyzed by computations across an ordered or unordered set of
+ rows. For example, <codeph>COUNT()</codeph> and <codeph>SUM()</codeph> might be applied to all the rows in
+ the partition, in which case the order of analysis does not matter. The <codeph>ORDER BY</codeph> clause
+        might be used inside the <codeph>OVER()</codeph> clause to define the ordering that applies to functions
+ such as <codeph>LAG()</codeph> and <codeph>FIRST_VALUE()</codeph>.
+ </p>
+
+<!-- TK: output needed here also. -->
+
+<!--
+<codeblock>SELECT year, month, max(degrees_c) OVER (PARTITION BY year ORDER BY MONTH DESC) FROM historical_temps;
+</codeblock>
+-->
+
+ <p>
+ Analytic functions are frequently used in fields such as finance and science to provide trend, outlier, and
+ bucketed analysis for large data sets. You might also see the term <q>window functions</q> in database
+ literature, referring to the sequence of rows (the <q>window</q>) that the function call applies to,
+ particularly when the <codeph>OVER</codeph> clause includes a <codeph>ROWS</codeph> or <codeph>RANGE</codeph>
+ keyword.
+ </p>
+
+ <p>
+ The following sections describe the analytic query clauses and the pure analytic functions provided by
+ Impala. For usage information about aggregate functions in an analytic context, see
+ <xref href="impala_aggregate_functions.xml#aggregate_functions"/>.
+ </p>
+
+ <p outputclass="toc inpage"/>
+
+ </conbody>
+
+ <concept id="over">
+
+ <title>OVER Clause</title>
+
+ <conbody>
+
+ <p>
+ The <codeph>OVER</codeph> clause is required for calls to pure analytic functions such as
+ <codeph>LEAD()</codeph>, <codeph>RANK()</codeph>, and <codeph>FIRST_VALUE()</codeph>. When you include an
+ <codeph>OVER</codeph> clause with calls to aggregate functions such as <codeph>MAX()</codeph>,
+ <codeph>COUNT()</codeph>, or <codeph>SUM()</codeph>, they operate as analytic functions.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>function(<varname>args</varname>) OVER([<varname>partition_by_clause</varname>] [<varname>order_by_clause</varname> [<varname>window_clause</varname>]])
+
+partition_by_clause ::= PARTITION BY <varname>expr</varname> [, <varname>expr</varname> ...]
+order_by_clause ::= ORDER BY <varname>expr</varname> [ASC | DESC] [NULLS FIRST | NULLS LAST] [, <varname>expr</varname> [ASC | DESC] [NULLS FIRST | NULLS LAST] ...]
+window_clause: See <xref href="#window_clause">Window Clause</xref>
+</codeblock>
+
+ <p>
+ <b>PARTITION BY clause:</b>
+ </p>
+
+ <p>
+ The <codeph>PARTITION BY</codeph> clause acts much like the <codeph>GROUP BY</codeph> clause in the
+ outermost block of a query. It divides the rows into groups containing identical values in one or more
+ columns. These logical groups are known as <term>partitions</term>. Throughout the discussion of analytic
+ functions, <q>partitions</q> refers to the groups produced by the <codeph>PARTITION BY</codeph> clause, not
+ to partitioned tables.
+ </p>
+
+ <p>
+ The sequence of results from an analytic function <q>resets</q> for each new partition in the result set.
+ That is, the set of preceding or following rows considered by the analytic function always come from a
+ single partition. Any <codeph>MAX()</codeph>, <codeph>SUM()</codeph>, <codeph>ROW_NUMBER()</codeph>, and so
+ on apply to each partition independently. Omit the <codeph>PARTITION BY</codeph> clause to apply the
+ analytic operation to all the rows in the table.
+ </p>
+
+ <p>
+ <b>ORDER BY clause:</b>
+ </p>
+
+ <p>
+ The <codeph>ORDER BY</codeph> clause works much like the <codeph>ORDER BY</codeph> clause in the outermost
+ block of a query. It defines the order in which rows are evaluated for the entire input set, or for each
+ group produced by a <codeph>PARTITION BY</codeph> clause. You can order by one or multiple expressions, and
+ for each expression optionally choose ascending or descending order and whether nulls come first or last in
+ the sort order. Because this <codeph>ORDER BY</codeph> clause only defines the order in which rows are
+ evaluated, if you want the results to be output in a specific order, also include an <codeph>ORDER
+ BY</codeph> clause in the outer block of the query.
+ </p>
+
+ <p>
+ When the <codeph>ORDER BY</codeph> clause is omitted, the analytic function applies to all items in the
+ group produced by the <codeph>PARTITION BY</codeph> clause. When the <codeph>ORDER BY</codeph> clause is
+ included, the analysis can apply to all or a subset of the items in the group, depending on the optional
+ window clause.
+ </p>
+
+ <p>
+ The order in which the rows are analyzed is only defined for those columns specified in <codeph>ORDER
+ BY</codeph> clauses.
+ </p>
+
+ <p>
+ One difference between the analytic and outer uses of the <codeph>ORDER BY</codeph> clause: inside the
+ <codeph>OVER</codeph> clause, <codeph>ORDER BY 1</codeph> or other integer value is interpreted as a
+ constant sort value (effectively a no-op) rather than referring to column 1.
+ </p>
+
+ <p>
+ <b>Window clause:</b>
+ </p>
+
+ <p>
+ The window clause is only allowed in combination with an <codeph>ORDER BY</codeph> clause. If the
+ <codeph>ORDER BY</codeph> clause is specified but the window clause is not, the default window is
+ <codeph>RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>. See
+ <xref href="impala_analytic_functions.xml#window_clause"/> for full details.
+ </p>
+
+<!--
+ <p conref="/Content/impala_common_xi44078.xml#common/usage_notes_blurb"/>
+-->
+
+ <p conref="../shared/impala_common.xml#common/hbase_blurb"/>
+
+ <p>
+ Because HBase tables are optimized for single-row lookups rather than full scans, analytic functions using
+ the <codeph>OVER()</codeph> clause are not recommended for HBase tables. Although such queries work, their
+ performance is lower than on comparable tables using HDFS data files.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/parquet_blurb"/>
+
+ <p>
+ Analytic functions are very efficient for Parquet tables. The data that is examined during evaluation of
+ the <codeph>OVER()</codeph> clause comes from a specified set of columns, and the values for each column
+ are arranged sequentially within each data file.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/text_blurb"/>
+
+ <p>
+ Analytic functions are convenient to use with text tables for exploratory business intelligence. When the
+ volume of data is substantial, prefer to use Parquet tables for performance-critical analytic queries.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example shows how to synthesize a numeric sequence corresponding to all the rows in a table.
+ The new table has the same columns as the old one, plus an additional column <codeph>ID</codeph> containing
+ the integers 1, 2, 3, and so on, corresponding to the order of a <codeph>TIMESTAMP</codeph> column in the
+ original table.
+ </p>
+
+<!-- TK: synthesize some data and fill in output here. -->
+
+<codeblock>CREATE TABLE events_with_id AS
+ SELECT
+ row_number() OVER (ORDER BY date_and_time) AS id,
+ c1, c2, c3, c4
+ FROM events;
+</codeblock>
+
+ <p>
+ The following example shows how to determine the number of rows containing each value for a column. Unlike
+ a corresponding <codeph>GROUP BY</codeph> query, this one can analyze a single column and still return all
+ values (not just the distinct ones) from the other columns.
+ </p>
+
+<!-- TK: verify the 'unbounded' shortcut syntax. -->
+
+<codeblock>SELECT x, y, z,
+ count(x) OVER (PARTITION BY x) AS how_many_x
+FROM t1;
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p>
+ You cannot directly combine the <codeph>DISTINCT</codeph> operator with analytic function calls. You can
+ put the analytic function call in a <codeph>WITH</codeph> clause or an inline view, and apply the
+ <codeph>DISTINCT</codeph> operator to its result set.
+ </p>
+
+<codeblock>WITH t1 AS (SELECT x, sum(x) OVER (PARTITION BY x) AS total FROM t1)
+ SELECT DISTINCT x, total FROM t1;
+</codeblock>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="window_clause">
+
+ <title>Window Clause</title>
+
+ <conbody>
+
+ <p>
+ Certain analytic functions accept an optional <term>window clause</term>, which makes the function analyze
+ only certain rows <q>around</q> the current row rather than all rows in the partition. For example, you can
+ get a moving average by specifying some number of preceding and following rows, or a running count or
+ running total by specifying all rows up to the current position. This clause can result in different
+ analytic results for rows within the same partition.
+ </p>
+
+ <p>
+ The window clause is supported with the <codeph>AVG()</codeph>, <codeph>COUNT()</codeph>,
+ <codeph>FIRST_VALUE()</codeph>, <codeph>LAST_VALUE()</codeph>, and <codeph>SUM()</codeph> functions.
+<!-- To do: fill in this factoid under MAX and MIN also. -->
+ For <codeph>MAX()</codeph> and <codeph>MIN()</codeph>, the window clause is only allowed if the start bound is
+ <codeph>UNBOUNDED PRECEDING</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>ROWS BETWEEN [ { <varname>m</varname> | UNBOUNDED } PRECEDING | CURRENT ROW] [ AND [CURRENT ROW | { UNBOUNDED | <varname>n</varname> } FOLLOWING] ]
+RANGE BETWEEN [ {<varname>m</varname> | UNBOUNDED } PRECEDING | CURRENT ROW] [ AND [CURRENT ROW | { UNBOUNDED | <varname>n</varname> } FOLLOWING] ]</codeblock>
+
+ <p>
+ <codeph>ROWS BETWEEN</codeph> defines the size of the window in terms of the indexes of the rows in the
+ result set. The size of the window is predictable based on the clauses and the position within the result set.
+ </p>
+
+ <p>
+ <codeph>RANGE BETWEEN</codeph> does not currently support numeric arguments to define a variable-size
+ sliding window.
+<!--
+Currently, it effectively works the same as the
+equivalent <codeph>ROWS BETWEEN</codeph> clause.
+-->
+ </p>
+
+<!--
+<p>
+<codeph>RANGE BETWEEN</codeph> defines the size of the window based on arithmetic comparisons
+of the values in the result set.
+The size of the window varies depending on the order and distribution of values.
+</p>
+-->
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/restrictions_blurb"/> -->
+
+ <p>
+ Currently, Impala supports only some combinations of arguments to the <codeph>RANGE</codeph> clause:
+ </p>
+
+ <ul>
+ <li>
+ <codeph>RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph> (the default when <codeph>ORDER
+ BY</codeph> is specified and the window clause is omitted)
+ </li>
+
+ <li>
+ <codeph>RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING</codeph>
+ </li>
+
+ <li>
+ <codeph>RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING</codeph>
+ </li>
+ </ul>
+
+ <p>
+ When <codeph>RANGE</codeph> is used, <codeph>CURRENT ROW</codeph> includes not just the current row but all
+ rows that are tied with the current row based on the <codeph>ORDER BY</codeph> expressions.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following examples show financial data for a fictional stock symbol <codeph>JDR</codeph>. The closing
+ price moves up and down each day.
+ </p>
+
+<codeblock>create table stock_ticker (stock_symbol string, closing_price decimal(8,2), closing_date timestamp);
+...load some data...
+select * from stock_ticker order by stock_symbol, closing_date
++--------------+---------------+---------------------+
+| stock_symbol | closing_price | closing_date |
++--------------+---------------+---------------------+
+| JDR | 12.86 | 2014-10-02 00:00:00 |
+| JDR | 12.89 | 2014-10-03 00:00:00 |
+| JDR | 12.94 | 2014-10-04 00:00:00 |
+| JDR | 12.55 | 2014-10-05 00:00:00 |
+| JDR | 14.03 | 2014-10-06 00:00:00 |
+| JDR | 14.75 | 2014-10-07 00:00:00 |
+| JDR | 13.98 | 2014-10-08 00:00:00 |
++--------------+---------------+---------------------+
+</codeblock>
+
+ <p>
+ The queries use analytic functions with window clauses to compute moving averages of the closing price. For
+ example, <codeph>ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING</codeph> produces an average of the value from a
+ 3-day span, producing a different value for each row. The first row, which has no preceding row, only gets
+ averaged with the row following it. If the table contained more than one stock symbol, the
+ <codeph>PARTITION BY</codeph> clause would limit the window for the moving average to only consider the
+ prices for a single stock.
+ </p>
+
+<codeblock>select stock_symbol, closing_date, closing_price,
+ avg(closing_price) over (partition by stock_symbol order by closing_date
+ rows between 1 preceding and 1 following) as moving_average
+ from stock_ticker;
++--------------+---------------------+---------------+----------------+
+| stock_symbol | closing_date | closing_price | moving_average |
++--------------+---------------------+---------------+----------------+
+| JDR | 2014-10-02 00:00:00 | 12.86 | 12.87 |
+| JDR | 2014-10-03 00:00:00 | 12.89 | 12.89 |
+| JDR | 2014-10-04 00:00:00 | 12.94 | 12.79 |
+| JDR | 2014-10-05 00:00:00 | 12.55 | 13.17 |
+| JDR | 2014-10-06 00:00:00 | 14.03 | 13.77 |
+| JDR | 2014-10-07 00:00:00 | 14.75 | 14.25 |
+| JDR | 2014-10-08 00:00:00 | 13.98 | 14.36 |
++--------------+---------------------+---------------+----------------+
+</codeblock>
+
+ <p>
+ The clause <codeph>ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph> produces a cumulative moving
+ average, from the earliest data up to the value for each day.
+ </p>
+
+<codeblock>select stock_symbol, closing_date, closing_price,
+ avg(closing_price) over (partition by stock_symbol order by closing_date
+ rows between unbounded preceding and current row) as moving_average
+ from stock_ticker;
++--------------+---------------------+---------------+----------------+
+| stock_symbol | closing_date | closing_price | moving_average |
++--------------+---------------------+---------------+----------------+
+| JDR | 2014-10-02 00:00:00 | 12.86 | 12.86 |
+| JDR | 2014-10-03 00:00:00 | 12.89 | 12.87 |
+| JDR | 2014-10-04 00:00:00 | 12.94 | 12.89 |
+| JDR | 2014-10-05 00:00:00 | 12.55 | 12.81 |
+| JDR | 2014-10-06 00:00:00 | 14.03 | 13.05 |
+| JDR | 2014-10-07 00:00:00 | 14.75 | 13.33 |
+| JDR | 2014-10-08 00:00:00 | 13.98 | 13.42 |
++--------------+---------------------+---------------+----------------+
+</codeblock>
+
+<!-- Matt suggests not always true depending on data. Hiding until I can try myself.
+<p>
+The clause <codeph>RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph> would produce the same
+output as above. Because <codeph>RANGE</codeph> currently does not support numeric offsets while
+<codeph>ROWS</codeph> does, currently the <codeph>ROWS</codeph> syntax is more flexible.
+</p>
+-->
+
+ </conbody>
+
+ </concept>
+
+ <concept id="avg_analytic">
+
+ <title>AVG() Function - Analytic Context</title>
+
+ <conbody>
+
+ <p>
+ You can include an <codeph>OVER</codeph> clause with a call to this function to use it as an analytic
+ function. See <xref href="impala_avg.xml#avg"/> for details and examples.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="count_analytic">
+
+ <title>COUNT() Function - Analytic Context</title>
+
+ <conbody>
+
+ <p>
+ You can include an <codeph>OVER</codeph> clause with a call to this function to use it as an analytic
+ function. See <xref href="impala_count.xml#count"/> for details and examples.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept rev="2.3.0" id="cume_dist">
+
+ <title>CUME_DIST() Function (CDH 5.5 or higher only)</title>
+
+ <conbody>
+
+ <p>
+ Returns the cumulative distribution of a value. The value for each row in the result set is greater than 0
+ and less than or equal to 1.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>CUME_DIST (<varname>expr</varname>)
+ OVER ([<varname>partition_by_clause</varname>] <varname>order_by_clause</varname>)
+</codeblock>
+
+ <p>
+ The <codeph>ORDER BY</codeph> clause is required. The <codeph>PARTITION BY</codeph> clause is optional. The
+ window clause is not allowed.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Within each partition of the result set, the <codeph>CUME_DIST()</codeph> value represents an ascending
+ sequence that ends at 1. Each value represents the proportion of rows in the partition whose values are
+ less than or equal to the value in the current row.
+ </p>
+
+ <p>
+ If the sequence of input values contains ties, the <codeph>CUME_DIST()</codeph> results are identical for the
+ tied values.
+ </p>
+
+ <p>
+ Impala only supports the <codeph>CUME_DIST()</codeph> function in an analytic context, not as a regular
+ aggregate function.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ This example uses a table with 9 rows. The <codeph>CUME_DIST()</codeph>
+ function evaluates the entire table because there is no <codeph>PARTITION BY</codeph> clause,
+ with the rows ordered by the weight of the animal.
+ The sequence of values shows that 1/9 of the values are less than or equal to the lightest
+ animal (mouse), 2/9 of the values are less than or equal to the second-lightest animal,
+ and so on up to the heaviest animal (elephant), where 9/9 of the rows are less than or
+ equal to its weight.
+ </p>
+
+<codeblock>create table animals (name string, kind string, kilos decimal(9,3));
+insert into animals values
+ ('Elephant', 'Mammal', 4000), ('Giraffe', 'Mammal', 1200), ('Mouse', 'Mammal', 0.020),
+ ('Condor', 'Bird', 15), ('Horse', 'Mammal', 500), ('Owl', 'Bird', 2.5),
+ ('Ostrich', 'Bird', 145), ('Polar bear', 'Mammal', 700), ('Housecat', 'Mammal', 5);
+
+select name, cume_dist() over (order by kilos) from animals;
++------------+-----------------------+
+| name | cume_dist() OVER(...) |
++------------+-----------------------+
+| Elephant | 1 |
+| Giraffe | 0.8888888888888888 |
+| Polar bear | 0.7777777777777778 |
+| Horse | 0.6666666666666666 |
+| Ostrich | 0.5555555555555556 |
+| Condor | 0.4444444444444444 |
+| Housecat | 0.3333333333333333 |
+| Owl | 0.2222222222222222 |
+| Mouse | 0.1111111111111111 |
++------------+-----------------------+
+</codeblock>
+
+ <p>
+ Using a <codeph>PARTITION BY</codeph> clause produces a separate sequence for each partition
+ group, in this case one for mammals and one for birds. Because there are 3 birds and 6 mammals,
+ the sequence illustrates how 1/3 of the <q>Bird</q> rows have a <codeph>kilos</codeph> value that is less than or equal to
+ the lightest bird, 1/6 of the <q>Mammal</q> rows have a <codeph>kilos</codeph> value that is less than or equal to
+ the lightest mammal, and so on until both the heaviest bird and heaviest mammal have a <codeph>CUME_DIST()</codeph>
+ value of 1.
+ </p>
+
+<codeblock>select name, kind, cume_dist() over (partition by kind order by kilos) from animals
++------------+--------+-----------------------+
+| name | kind | cume_dist() OVER(...) |
++------------+--------+-----------------------+
+| Ostrich | Bird | 1 |
+| Condor | Bird | 0.6666666666666666 |
+| Owl | Bird | 0.3333333333333333 |
+| Elephant | Mammal | 1 |
+| Giraffe | Mammal | 0.8333333333333334 |
+| Polar bear | Mammal | 0.6666666666666666 |
+| Horse | Mammal | 0.5 |
+| Housecat | Mammal | 0.3333333333333333 |
+| Mouse | Mammal | 0.1666666666666667 |
++------------+--------+-----------------------+
+</codeblock>
+
+ <p>
+ We can reverse the ordering within each partition group by using an <codeph>ORDER BY ... DESC</codeph>
+ clause within the <codeph>OVER()</codeph> clause. Now the lightest (smallest value of <codeph>kilos</codeph>)
+ animal of each kind has a <codeph>CUME_DIST()</codeph> value of 1.
+ </p>
+
+<codeblock>select name, kind, cume_dist() over (partition by kind order by kilos desc) from animals
++------------+--------+-----------------------+
+| name | kind | cume_dist() OVER(...) |
++------------+--------+-----------------------+
+| Owl | Bird | 1 |
+| Condor | Bird | 0.6666666666666666 |
+| Ostrich | Bird | 0.3333333333333333 |
+| Mouse | Mammal | 1 |
+| Housecat | Mammal | 0.8333333333333334 |
+| Horse | Mammal | 0.6666666666666666 |
+| Polar bear | Mammal | 0.5 |
+| Giraffe | Mammal | 0.3333333333333333 |
+| Elephant | Mammal | 0.1666666666666667 |
++------------+--------+-----------------------+
+</codeblock>
+
+ <p>
+ The following example manufactures some rows with identical values in the <codeph>kilos</codeph> column,
+ to demonstrate how the results look in case of tie values. For simplicity, it only shows the <codeph>CUME_DIST()</codeph>
+ sequence for the <q>Bird</q> rows. Now with 3 rows all with a value of 15, all of those rows have the same
+ <codeph>CUME_DIST()</codeph> value. 4/5 of the rows have a value for <codeph>kilos</codeph> that is less than or
+ equal to 15.
+ </p>
+
+<codeblock>insert into animals values ('California Condor', 'Bird', 15), ('Andean Condor', 'Bird', 15)
+
+select name, kind, cume_dist() over (order by kilos) from animals where kind = 'Bird';
++-------------------+------+-----------------------+
+| name | kind | cume_dist() OVER(...) |
++-------------------+------+-----------------------+
+| Ostrich | Bird | 1 |
+| Condor | Bird | 0.8 |
+| California Condor | Bird | 0.8 |
+| Andean Condor | Bird | 0.8 |
+| Owl | Bird | 0.2 |
++-------------------+------+-----------------------+
+</codeblock>
+
+ <p>
+ The following example shows how to use an <codeph>ORDER BY</codeph> clause in the outer block
+ to order the result set in case of ties. Here, all the <q>Bird</q> rows are together, then in descending order
+ by the result of the <codeph>CUME_DIST()</codeph> function, and all tied <codeph>CUME_DIST()</codeph>
+ values are ordered by the animal name.
+ </p>
+
+<codeblock>select name, kind, cume_dist() over (partition by kind order by kilos) as ordering
+ from animals
+where
+ kind = 'Bird'
+order by kind, ordering desc, name;
++-------------------+------+----------+
+| name | kind | ordering |
++-------------------+------+----------+
+| Ostrich | Bird | 1 |
+| Andean Condor | Bird | 0.8 |
+| California Condor | Bird | 0.8 |
+| Condor | Bird | 0.8 |
+| Owl | Bird | 0.2 |
++-------------------+------+----------+
+</codeblock>
+
+ </conbody>
+
+ </concept>
+
+ <concept rev="2.0.0" id="dense_rank">
+
+ <title>DENSE_RANK() Function</title>
+
+ <conbody>
+
+ <p>
+ Returns an ascending sequence of integers, starting with 1. The output sequence produces duplicate integers
+ for duplicate values of the <codeph>ORDER BY</codeph> expressions. After generating duplicate output values
+ for the <q>tied</q> input values, the function continues the sequence with the next higher integer.
+ Therefore, the sequence contains duplicates but no gaps when the input contains duplicates. The sequence
+ starts over for each group produced by the <codeph>PARTITION BY</codeph> clause.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>DENSE_RANK() OVER([<varname>partition_by_clause</varname>] <varname>order_by_clause</varname>)</codeblock>
+
+ <p>
+ The <codeph>PARTITION BY</codeph> clause is optional. The <codeph>ORDER BY</codeph> clause is required. The
+ window clause is not allowed.
+ </p>
+
+<!-- Can make the text for ROW_NUMBER, RANK, and DENSE_RANK identical
+ so it can be conref'ed in all 3 places. -->
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Often used for top-N and bottom-N queries. For example, it could produce a <q>top 10</q> report including
+ all the items with the 10 highest values, even if several items tied for 1st place.
+ </p>
+
+ <p>
+ Similar to <codeph>ROW_NUMBER</codeph> and <codeph>RANK</codeph>. These functions differ in how they treat
+ duplicate combinations of values.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example demonstrates how the <codeph>DENSE_RANK()</codeph> function identifies where each
+ value <q>places</q> in the result set, producing the same result for duplicate values, but with a strict
+ sequence from 1 to the number of groups. For example, when results are ordered by the <codeph>X</codeph>
+ column, both <codeph>1</codeph> values are tied for first; both <codeph>2</codeph> values are tied for
+ second; and so on.
+ </p>
+
+<codeblock>select x, dense_rank() over(order by x) as rank, property from int_t;
++----+------+----------+
+| x | rank | property |
++----+------+----------+
+| 1 | 1 | square |
+| 1 | 1 | odd |
+| 2 | 2 | even |
+| 2 | 2 | prime |
+| 3 | 3 | prime |
+| 3 | 3 | odd |
+| 4 | 4 | even |
+| 4 | 4 | square |
+| 5 | 5 | odd |
+| 5 | 5 | prime |
+| 6 | 6 | even |
+| 6 | 6 | perfect |
+| 7 | 7 | lucky |
+| 7 | 7 | lucky |
+| 7 | 7 | lucky |
+| 7 | 7 | odd |
+| 7 | 7 | prime |
+| 8 | 8 | even |
+| 9 | 9 | square |
+| 9 | 9 | odd |
+| 10 | 10 | round |
+| 10 | 10 | even |
++----+------+----------+
+</codeblock>
+
+ <p>
+ The following examples show how the <codeph>DENSE_RANK()</codeph> function is affected by the
+ <codeph>PARTITION BY</codeph> clause within the <codeph>OVER()</codeph> clause.
+ </p>
+
+ <p>
+ Partitioning by the <codeph>PROPERTY</codeph> column groups all the even, odd, and so on values together,
+ and <codeph>DENSE_RANK()</codeph> returns the place of each value within the group, producing several
+ ascending sequences.
+ </p>
+
+<codeblock>select x, dense_rank() over(partition by property order by x) as rank, property from int_t;
++----+------+----------+
+| x | rank | property |
++----+------+----------+
+| 2 | 1 | even |
+| 4 | 2 | even |
+| 6 | 3 | even |
+| 8 | 4 | even |
+| 10 | 5 | even |
+| 7 | 1 | lucky |
+| 7 | 1 | lucky |
+| 7 | 1 | lucky |
+| 1 | 1 | odd |
+| 3 | 2 | odd |
+| 5 | 3 | odd |
+| 7 | 4 | odd |
+| 9 | 5 | odd |
+| 6 | 1 | perfect |
+| 2 | 1 | prime |
+| 3 | 2 | prime |
+| 5 | 3 | prime |
+| 7 | 4 | prime |
+| 10 | 1 | round |
+| 1 | 1 | square |
+| 4 | 2 | square |
+| 9 | 3 | square |
++----+------+----------+
+</codeblock>
+
+ <p>
+ Partitioning by the <codeph>X</codeph> column groups all the duplicate numbers together and returns the
+ place of each value within the group; because each value occurs only 1 or 2 times,
+ <codeph>DENSE_RANK()</codeph> designates each <codeph>X</codeph> value as either first or second within its
+ group.
+ </p>
+
+<codeblock>select x, dense_rank() over(partition by x order by property) as rank, property from int_t;
++----+------+----------+
+| x | rank | property |
++----+------+----------+
+| 1 | 1 | odd |
+| 1 | 2 | square |
+| 2 | 1 | even |
+| 2 | 2 | prime |
+| 3 | 1 | odd |
+| 3 | 2 | prime |
+| 4 | 1 | even |
+| 4 | 2 | square |
+| 5 | 1 | odd |
+| 5 | 2 | prime |
+| 6 | 1 | even |
+| 6 | 2 | perfect |
+| 7 | 1 | lucky |
+| 7 | 1 | lucky |
+| 7 | 1 | lucky |
+| 7 | 2 | odd |
+| 7 | 3 | prime |
+| 8 | 1 | even |
+| 9 | 1 | odd |
+| 9 | 2 | square |
+| 10 | 1 | even |
+| 10 | 2 | round |
++----+------+----------+
+</codeblock>
+
+ <p>
+ The following example shows how <codeph>DENSE_RANK()</codeph> produces a continuous sequence while still
+ allowing for ties. In this case, Croesus and Midas both have the second largest fortune, while Crassus has
+ the third largest. (In <xref href="impala_analytic_functions.xml#rank"/>, you see a similar query with the
+ <codeph>RANK()</codeph> function that shows that while Crassus has the third largest fortune, he is the
+ fourth richest person.)
+ </p>
+
+<codeblock>select dense_rank() over (order by net_worth desc) as placement, name, net_worth from wealth order by placement, name;
++-----------+---------+---------------+
+| placement | name | net_worth |
++-----------+---------+---------------+
+| 1 | Solomon | 2000000000.00 |
+| 2 | Croesus | 1000000000.00 |
+| 2 | Midas | 1000000000.00 |
+| 3 | Crassus | 500000000.00 |
+| 4 | Scrooge | 80000000.00 |
++-----------+---------+---------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_analytic_functions.xml#rank"/>, <xref href="impala_analytic_functions.xml#row_number"/>
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept rev="2.0.0" id="first_value">
+
+ <title>FIRST_VALUE() Function</title>
+
+ <conbody>
+
+ <p>
+ Returns the expression value from the first row in the window. The return value is <codeph>NULL</codeph> if
+ the input expression is <codeph>NULL</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>FIRST_VALUE(<varname>expr</varname>) OVER([<varname>partition_by_clause</varname>] <varname>order_by_clause</varname> [<varname>window_clause</varname>])</codeblock>
+
+ <p>
+ The <codeph>PARTITION BY</codeph> clause is optional. The <codeph>ORDER BY</codeph> clause is required. The
+ window clause is optional.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ If any duplicate values occur in the tuples evaluated by the <codeph>ORDER BY</codeph> clause, the result
+ of this function is not deterministic. Consider adding additional <codeph>ORDER BY</codeph> columns to
+ ensure consistent ordering.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example shows a table with a wide variety of country-appropriate greetings. For consistency,
+ we want to standardize on a single greeting for each country. The <codeph>FIRST_VALUE()</codeph> function
+ helps to produce a mail merge report where every person from the same country is addressed with the same
+ greeting.
+ </p>
+
+<codeblock>select name, country, greeting from mail_merge
++---------+---------+--------------+
+| name | country | greeting |
++---------+---------+--------------+
+| Pete | USA | Hello |
+| John | USA | Hi |
+| Boris | Germany | Guten tag |
+| Michael | Germany | Guten morgen |
+| Bjorn | Sweden | Hej |
+| Mats | Sweden | Tja |
++---------+---------+--------------+
+
+select country, name,
+ first_value(greeting)
+ over (partition by country order by name, greeting) as greeting
+ from mail_merge;
++---------+---------+-----------+
+| country | name | greeting |
++---------+---------+-----------+
+| Germany | Boris | Guten tag |
+| Germany | Michael | Guten tag |
+| Sweden | Bjorn | Hej |
+| Sweden | Mats | Hej |
+| USA | John | Hi |
+| USA | Pete | Hi |
++---------+---------+-----------+
+</codeblock>
+
+ <p>
+ Changing the order in which the names are evaluated changes which greeting is applied to each group.
+ </p>
+
+<codeblock>select country, name,
+ first_value(greeting)
+ over (partition by country order by name desc, greeting) as greeting
+ from mail_merge;
++---------+---------+--------------+
+| country | name | greeting |
++---------+---------+--------------+
+| Germany | Michael | Guten morgen |
+| Germany | Boris | Guten morgen |
+| Sweden | Mats | Tja |
+| Sweden | Bjorn | Tja |
+| USA | Pete | Hello |
+| USA | John | Hello |
++---------+---------+--------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_analytic_functions.xml#last_value"/>
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept rev="2.0.0" id="lag">
+
+ <title>LAG() Function</title>
+
+ <conbody>
+
+ <p>
+ This function returns the value of an expression using column values from a preceding row. You specify an
+ integer offset, which designates a row position some number of rows previous to the current row. Any column
+ references in the expression argument refer to column values from that prior row. Typically, the table
+ contains a time sequence or numeric sequence column that clearly distinguishes the ordering of the rows.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>LAG (<varname>expr</varname> [, <varname>offset</varname>] [, <varname>default</varname>])
+ OVER ([<varname>partition_by_clause</varname>] <varname>order_by_clause</varname>)</codeblock>
+
+ <p>
+ The <codeph>ORDER BY</codeph> clause is required. The <codeph>PARTITION BY</codeph> clause is optional. The
+ window clause is not allowed.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Sometimes used as an alternative to doing a self-join.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example uses the same stock data created in <xref href="#window_clause"/>. For each day, the
+ query prints the closing price alongside the previous day's closing price. The first row for each stock
+ symbol has no previous row, so that <codeph>LAG()</codeph> value is <codeph>NULL</codeph>.
+ </p>
+
+<codeblock>select stock_symbol, closing_date, closing_price,
+ lag(closing_price,1) over (partition by stock_symbol order by closing_date) as "yesterday closing"
+ from stock_ticker
+ order by closing_date;
++--------------+---------------------+---------------+-------------------+
+| stock_symbol | closing_date | closing_price | yesterday closing |
++--------------+---------------------+---------------+-------------------+
+| JDR | 2014-09-13 00:00:00 | 12.86 | NULL |
+| JDR | 2014-09-14 00:00:00 | 12.89 | 12.86 |
+| JDR | 2014-09-15 00:00:00 | 12.94 | 12.89 |
+| JDR | 2014-09-16 00:00:00 | 12.55 | 12.94 |
+| JDR | 2014-09-17 00:00:00 | 14.03 | 12.55 |
+| JDR | 2014-09-18 00:00:00 | 14.75 | 14.03 |
+| JDR | 2014-09-19 00:00:00 | 13.98 | 14.75 |
++--------------+---------------------+---------------+-------------------+
+</codeblock>
+
+ <p>
+ The following example does an arithmetic operation between the current row and a value from the previous
+ row, to produce a delta value for each day. This example also demonstrates how <codeph>ORDER BY</codeph>
+ works independently in the different parts of the query. The <codeph>ORDER BY closing_date</codeph> in the
+ <codeph>OVER</codeph> clause makes the query analyze the rows in chronological order. Then the outer query
+ block uses <codeph>ORDER BY closing_date DESC</codeph> to present the results with the most recent date
+ first.
+ </p>
+
+<codeblock>select stock_symbol, closing_date, closing_price,
+ cast(
+ closing_price - lag(closing_price,1) over
+ (partition by stock_symbol order by closing_date)
+ as decimal(8,2)
+ )
+ as "change from yesterday"
+ from stock_ticker
+ order by closing_date desc;
++--------------+---------------------+---------------+-----------------------+
+| stock_symbol | closing_date | closing_price | change from yesterday |
++--------------+---------------------+---------------+-----------------------+
+| JDR | 2014-09-19 00:00:00 | 13.98 | -0.76 |
+| JDR | 2014-09-18 00:00:00 | 14.75 | 0.72 |
+| JDR | 2014-09-17 00:00:00 | 14.03 | 1.47 |
+| JDR | 2014-09-16 00:00:00 | 12.55 | -0.38 |
+| JDR | 2014-09-15 00:00:00 | 12.94 | 0.04 |
+| JDR | 2014-09-14 00:00:00 | 12.89 | 0.03 |
+| JDR | 2014-09-13 00:00:00 | 12.86 | NULL |
++--------------+---------------------+---------------+-----------------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ This function is the converse of <xref href="impala_analytic_functions.xml#lead"/>.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept rev="2.0.0" id="last_value">
+
+ <title>LAST_VALUE() Function</title>
+
+ <conbody>
+
+ <p>
+ Returns the expression value from the last row in the window. This same value is repeated for all result
+ rows for the group. The return value is <codeph>NULL</codeph> if the input expression is
+ <codeph>NULL</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>LAST_VALUE(<varname>expr</varname>) OVER([<varname>partition_by_clause</varname>] <varname>order_by_clause</varname> [<varname>window_clause</varname>])</codeblock>
+
+ <p>
+ The <codeph>PARTITION BY</codeph> clause is optional. The <codeph>ORDER BY</codeph> clause is required. The
+ window clause is optional.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ If any duplicate values occur in the tuples evaluated by the <codeph>ORDER BY</codeph> clause, the result
+ of this function is not deterministic. Consider adding additional <codeph>ORDER BY</codeph> columns to
+ ensure consistent ordering.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example uses the same <codeph>MAIL_MERGE</codeph> table as in the example for
+ <xref href="impala_analytic_functions.xml#first_value"/>. Because the default window when <codeph>ORDER
+ BY</codeph> is used is <codeph>RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>, the query requires the
+ <codeph>UNBOUNDED FOLLOWING</codeph> to look ahead to subsequent rows and find the last value for each
+ country.
+ </p>
+
+<codeblock>select country, name,
+ last_value(greeting) over (
+ partition by country order by name, greeting
+ rows between unbounded preceding and unbounded following
+ ) as greeting
+ from mail_merge
++---------+---------+--------------+
+| country | name | greeting |
++---------+---------+--------------+
+| Germany | Boris | Guten morgen |
+| Germany | Michael | Guten morgen |
+| Sweden | Bjorn | Tja |
+| Sweden | Mats | Tja |
+| USA | John | Hello |
+| USA | Pete | Hello |
++---------+---------+--------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_analytic_functions.xml#first_value"/>
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept rev="2.0.0" id="lead">
+
+ <title>LEAD() Function</title>
+
+ <conbody>
+
+ <p>
+ This function returns the value of an expression using column values from a following row. You specify an
+ integer offset, which designates a row position some number of rows after the current row. Any column
+ references in the expression argument refer to column values from that later row. Typically, the table
+ contains a time sequence or numeric sequence column that clearly distinguishes the ordering of the rows.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>LEAD (<varname>expr</varname> [, <varname>offset</varname>] [, <varname>default</varname>])
+ OVER ([<varname>partition_by_clause</varname>] <varname>order_by_clause</varname>)</codeblock>
+
+ <p>
+ The <codeph>ORDER BY</codeph> clause is required. The <codeph>PARTITION BY</codeph> clause is optional. The
+ window clause is not allowed.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Sometimes used as an alternative to doing a self-join.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example uses the same stock data created in <xref href="impala_analytic_functions.xml#window_clause"/>. The query analyzes
+ the closing price for a stock symbol, and for each day evaluates if the closing price for the following day
+ is higher or lower.
+ </p>
+
+<codeblock>select stock_symbol, closing_date, closing_price,
+ case
+ (lead(closing_price,1)
+ over (partition by stock_symbol order by closing_date)
+ - closing_price) > 0
+ when true then "higher"
+ when false then "flat or lower"
+ end as "trending"
+from stock_ticker
+ order by closing_date;
++--------------+---------------------+---------------+---------------+
+| stock_symbol | closing_date | closing_price | trending |
++--------------+---------------------+---------------+---------------+
+| JDR | 2014-09-13 00:00:00 | 12.86 | higher |
+| JDR | 2014-09-14 00:00:00 | 12.89 | higher |
+| JDR | 2014-09-15 00:00:00 | 12.94 | flat or lower |
+| JDR | 2014-09-16 00:00:00 | 12.55 | higher |
+| JDR | 2014-09-17 00:00:00 | 14.03 | higher |
+| JDR | 2014-09-18 00:00:00 | 14.75 | flat or lower |
+| JDR | 2014-09-19 00:00:00 | 13.98 | NULL |
++--------------+---------------------+---------------+---------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ This function is the converse of <xref href="impala_analytic_functions.xml#lag"/>.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="max_analytic">
+
+ <title>MAX() Function - Analytic Context</title>
+
+ <conbody>
+
+ <p>
+ You can include an <codeph>OVER</codeph> clause with a call to this function to use it as an analytic
+ function. See <xref href="impala_max.xml#max"/> for details and examples.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="min_analytic">
+
+ <title>MIN() Function - Analytic Context</title>
+
+ <conbody>
+
+ <p>
+ You can include an <codeph>OVER</codeph> clause with a call to this function to use it as an analytic
+ function. See <xref href="impala_min.xml#min"/> for details and examples.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept audience="Cloudera" rev="2.x.x" id="nth_value">
+
+ <title>NTH_VALUE() Function</title>
+
+ <conbody>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ </conbody>
+
+ </concept>
+
+ <concept rev="2.3.0" id="ntile">
+
+ <title>NTILE() Function (CDH 5.5 or higher only)</title>
+
+ <conbody>
+
+ <p>
+ Returns the <q>bucket number</q> associated with each row, between 1 and the value of an expression. For
+ example, creating 100 buckets puts the lowest 1% of values in the first bucket, while creating 10 buckets
+ puts the lowest 10% of values in the first bucket. Each partition can have a different number of buckets.
+<!-- What's the syntax or data distribution that would create a different number of buckets per partition? -->
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>NTILE (<varname>expr</varname> [, <varname>offset</varname> ...])
+ OVER ([<varname>partition_by_clause</varname>] <varname>order_by_clause</varname>)</codeblock>
+
+ <p>
+ The <codeph>ORDER BY</codeph> clause is required. The <codeph>PARTITION BY</codeph> clause is optional. The
+ window clause is not allowed.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ The <q>ntile</q> name is derived from the practice of dividing result sets into fourths (quartile), tenths
+ (decile), and so on. The <codeph>NTILE()</codeph> function divides the result set based on an arbitrary
+ percentile value.
+ </p>
+
+ <p>
+ The number of buckets must be a positive integer.
+ </p>
+
+ <p>
+ The number of items in each bucket is identical or almost so, varying by at most 1. If the number of items
+ does not divide evenly between the buckets, the remaining N items are divided evenly among the first N
+ buckets.
+ </p>
+
+ <p>
+ If the number of buckets N is greater than the number of input rows in the partition, then the first N
+ buckets each contain one item, and the remaining buckets are empty.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example divides groups of animals into 4 buckets based on their weight. The
+ <codeph>ORDER BY ... DESC</codeph> clause in the <codeph>OVER()</codeph> clause means that the heaviest 25%
+ are in the first group, and the lightest 25% are in the fourth group. (The <codeph>ORDER BY</codeph> in the
+ outermost part of the query shows how you can order the final result set independently from the order in
+ which the rows are evaluated by the <codeph>OVER()</codeph> clause.) Because there are 9 rows in the group,
+ divided into 4 buckets, the first bucket receives the extra item.
+ </p>
+
+<codeblock>create table animals (name string, kind string, kilos decimal(9,3));
+
+insert into animals values
+ ('Elephant', 'Mammal', 4000), ('Giraffe', 'Mammal', 1200), ('Mouse', 'Mammal', 0.020),
+ ('Condor', 'Bird', 15), ('Horse', 'Mammal', 500), ('Owl', 'Bird', 2.5),
+ ('Ostrich', 'Bird', 145), ('Polar bear', 'Mammal', 700), ('Housecat', 'Mammal', 5);
+
+select name, ntile(4) over (order by kilos desc) as quarter
+ from animals
+order by quarter desc;
++------------+---------+
+| name | quarter |
++------------+---------+
+| Owl | 4 |
+| Mouse | 4 |
+| Condor | 3 |
+| Housecat | 3 |
+| Horse | 2 |
+| Ostrich | 2 |
+| Elephant | 1 |
+| Giraffe | 1 |
+| Polar bear | 1 |
++------------+---------+
+</codeblock>
+
+ <p>
+ The following examples show how the <codeph>PARTITION BY</codeph> clause works for the
+ <codeph>NTILE()</codeph> function. Here, we divide each kind of animal (mammal or bird) into 2 buckets,
+ the heavier half and the lighter half.
+ </p>
+
+<codeblock>select name, kind, ntile(2) over (partition by kind order by kilos desc) as half
+ from animals
+order by kind;
++------------+--------+------+
+| name | kind | half |
++------------+--------+------+
+| Ostrich | Bird | 1 |
+| Condor | Bird | 1 |
+| Owl | Bird | 2 |
+| Elephant | Mammal | 1 |
+| Giraffe | Mammal | 1 |
+| Polar bear | Mammal | 1 |
+| Horse | Mammal | 2 |
+| Housecat | Mammal | 2 |
+| Mouse | Mammal | 2 |
++------------+--------+------+
+</codeblock>
+
+ <p>
+ Again, the result set can be ordered independently
+ from the analytic evaluation. This next example lists all the animals heaviest to lightest,
+ showing that elephant and giraffe are in the <q>top half</q> of mammals by weight, while
+ housecat and mouse are in the <q>bottom half</q>.
+ </p>
+
+<codeblock>select name, kind, ntile(2) over (partition by kind order by kilos desc) as half
+ from animals
+order by kilos desc;
++------------+--------+------+
+| name | kind | half |
++------------+--------+------+
+| Elephant | Mammal | 1 |
+| Giraffe | Mammal | 1 |
+| Polar bear | Mammal | 1 |
+| Horse | Mammal | 2 |
+| Ostrich | Bird | 1 |
+| Condor | Bird | 1 |
+| Housecat | Mammal | 2 |
+| Owl | Bird | 2 |
+| Mouse | Mammal | 2 |
++------------+--------+------+
+</codeblock>
+
+ </conbody>
+
+ </concept>
+
+ <concept rev="2.3.0" id="percent_rank">
+
+ <title>PERCENT_RANK() Function (CDH 5.5 or higher only)</title>
+
+ <conbody>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>PERCENT_RANK (<varname>expr</varname>)
+ OVER ([<varname>partition_by_clause</varname>] <varname>order_by_clause</varname>)
+</codeblock>
+
+ <p>
+ Calculates the rank, expressed as a percentage, of each row within a group of rows.
+ If <codeph>rank</codeph> is the value for that same row from the <codeph>RANK()</codeph> function (from 1 to the total number of rows in the partition group),
+ then the <codeph>PERCENT_RANK()</codeph> value is calculated as <codeph>(<varname>rank</varname> - 1) / (<varname>rows_in_group</varname> - 1)</codeph> .
+ If there is only a single item in the partition group, its <codeph>PERCENT_RANK()</codeph> value is 0.
+ </p>
+
+ <p>
+ The <codeph>ORDER BY</codeph> clause is required. The <codeph>PARTITION BY</codeph> clause is optional. The
+ window clause is not allowed.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ This function is similar to the <codeph>RANK</codeph> and <codeph>CUME_DIST()</codeph> functions: it returns an ascending sequence representing the position of each
+ row within the rows of the same partition group. The actual numeric sequence is calculated differently,
+ and the handling of duplicate (tied) values is different.
+ </p>
+
+ <p>
+ The return values range from 0 to 1 inclusive.
+ The first row in each partition group always has the value 0.
+ A <codeph>NULL</codeph> value is considered the lowest possible value.
+ In the case of duplicate input values, all the corresponding rows in the result set
+ have an identical value: the lowest <codeph>PERCENT_RANK()</codeph> value of those
+ tied rows. (In contrast to <codeph>CUME_DIST()</codeph>, where all tied rows have
+ the highest <codeph>CUME_DIST()</codeph> value.)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example uses the same <codeph>ANIMALS</codeph> table as the examples for <codeph>CUME_DIST()</codeph>
+ and <codeph>NTILE()</codeph>, with a few additional rows to illustrate the results where some values are
+ <codeph>NULL</codeph> or there is only a single row in a partition group.
+ </p>
+
+<codeblock>insert into animals values ('Komodo dragon', 'Reptile', 70);
+insert into animals values ('Unicorn', 'Mythical', NULL);
+insert into animals values ('Fire-breathing dragon', 'Mythical', NULL);
+</codeblock>
+
+ <p>
+ As with <codeph>CUME_DIST()</codeph>, there is an ascending sequence for each kind of animal.
+ For example, the <q>Birds</q> and <q>Mammals</q> rows each have a <codeph>PERCENT_RANK()</codeph> sequence
+ that ranges from 0 to 1.
+ The <q>Reptile</q> row has a <codeph>PERCENT_RANK()</codeph> of 0 because that partition group contains only a single item.
+ Both <q>Mythical</q> animals have a <codeph>PERCENT_RANK()</codeph> of 0 because
+ a <codeph>NULL</codeph> is considered the lowest value within its partition group.
+ </p>
+
+<codeblock>select name, kind, percent_rank() over (partition by kind order by kilos) from animals;
++-----------------------+----------+--------------------------+
+| name | kind | percent_rank() OVER(...) |
++-----------------------+----------+--------------------------+
+| Mouse | Mammal | 0 |
+| Housecat | Mammal | 0.2 |
+| Horse | Mammal | 0.4 |
+| Polar bear | Mammal | 0.6 |
+| Giraffe | Mammal | 0.8 |
+| Elephant | Mammal | 1 |
+| Komodo dragon | Reptile | 0 |
+| Owl | Bird | 0 |
+| California Condor | Bird | 0.25 |
+| Andean Condor | Bird | 0.25 |
+| Condor | Bird | 0.25 |
+| Ostrich | Bird | 1 |
+| Fire-breathing dragon | Mythical | 0 |
+| Unicorn | Mythical | 0 |
++-----------------------+----------+--------------------------+
+</codeblock>
+ </conbody>
+
+ </concept>
+
+ <concept rev="2.0.0" id="rank">
+
+ <title>RANK() Function</title>
+
+ <conbody>
+
+ <p>
+ Returns an ascending sequence of integers, starting with 1. The output sequence produces duplicate integers
+ for duplicate values of the <codeph>ORDER BY</codeph> expressions. After generating duplicate output values
+ for the <q>tied</q> input values, the function increments the sequence by the number of tied values.
+ Therefore, the sequence contains both duplicates and gaps when the input contains duplicates. Starts the
+ sequence over for each group produced by the <codeph>PARTITION BY</codeph> clause.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>RANK() OVER([<varname>partition_by_clause</varname>] <varname>order_by_clause</varname>)</codeblock>
+
+ <p>
+ The <codeph>PARTITION BY</codeph> clause is optional. The <codeph>ORDER BY</codeph> clause is required. The
+ window clause is not allowed.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+<!-- Make a little tutorial to show these 3 functions side-by-side and illustrate their difference. -->
+
+ <p>
+ Often used for top-N and bottom-N queries. For example, it could produce a <q>top 10</q> report including
+ several items that were tied for 10th place.
+ </p>
+
+ <p>
+ Similar to <codeph>ROW_NUMBER</codeph> and <codeph>DENSE_RANK</codeph>. These functions differ in how they
+ treat duplicate combinations of values.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example demonstrates how the <codeph>RANK()</codeph> function identifies where each value
+ <q>places</q> in the result set, producing the same result for duplicate values, and skipping values in the
+ sequence to account for the number of duplicates. For example, when results are ordered by the
+ <codeph>X</codeph> column, both <codeph>1</codeph> values are tied for first; both <codeph>2</codeph>
+ values are tied for third; and so on.
+ </p>
+
+<codeblock>select x, rank() over(order by x) as rank, property from int_t;
++----+------+----------+
+| x | rank | property |
++----+------+----------+
+| 1 | 1 | square |
+| 1 | 1 | odd |
+| 2 | 3 | even |
+| 2 | 3 | prime |
+| 3 | 5 | prime |
+| 3 | 5 | odd |
+| 4 | 7 | even |
+| 4 | 7 | square |
+| 5 | 9 | odd |
+| 5 | 9 | prime |
+| 6 | 11 | even |
+| 6 | 11 | perfect |
+| 7 | 13 | lucky |
+| 7 | 13 | lucky |
+| 7 | 13 | lucky |
+| 7 | 13 | odd |
+| 7 | 13 | prime |
+| 8 | 18 | even |
+| 9 | 19 | square |
+| 9 | 19 | odd |
+| 10 | 21 | round |
+| 10 | 21 | even |
++----+------+----------+
+</codeblock>
+
+ <p>
+ The following examples show how the <codeph>RANK()</codeph> function is affected by the
+ <codeph>PARTITION BY</codeph> clause within the <codeph>OVER()</codeph> clause.
+ </p>
+
+ <p>
+ Partitioning by the <codeph>PROPERTY</codeph> column groups all the even, odd, and so on values together,
+ and <codeph>RANK()</codeph> returns the place of each value within the group, producing several ascending
+ sequences.
+ </p>
+
+<codeblock>select x, rank() over(partition by property order by x) as rank, property from int_t;
++----+------+----------+
+| x | rank | property |
++----+------+----------+
+| 2 | 1 | even |
+| 4 | 2 | even |
+| 6 | 3 | even |
+| 8 | 4 | even |
+| 10 | 5 | even |
+| 7 | 1 | lucky |
+| 7 | 1 | lucky |
+| 7 | 1 | lucky |
+| 1 | 1 | odd |
+| 3 | 2 | odd |
+| 5 | 3 | odd |
+| 7 | 4 | odd |
+| 9 | 5 | odd |
+| 6 | 1 | perfect |
+| 2 | 1 | prime |
+| 3 | 2 | prime |
+| 5 | 3 | prime |
+| 7 | 4 | prime |
+| 10 | 1 | round |
+| 1 | 1 | square |
+| 4 | 2 | square |
+| 9 | 3 | square |
++----+------+----------+
+</codeblock>
+
+ <p>
+ Partitioning by the <codeph>X</codeph> column groups all the duplicate numbers together and returns the
+ place of each value within the group; because each value occurs only 1 or 2 times,
+ <codeph>RANK()</codeph> designates each <codeph>X</codeph> value as either first or second within its
+ group.
+ </p>
+
+<codeblock>select x, rank() over(partition by x order by property) as rank, property from int_t;
++----+------+----------+
+| x | rank | property |
++----+------+----------+
+| 1 | 1 | odd |
+| 1 | 2 | square |
+| 2 | 1 | even |
+| 2 | 2 | prime |
+| 3 | 1 | odd |
+| 3 | 2 | prime |
+| 4 | 1 | even |
+| 4 | 2 | square |
+| 5 | 1 | odd |
+| 5 | 2 | prime |
+| 6 | 1 | even |
+| 6 | 2 | perfect |
+| 7 | 1 | lucky |
+| 7 | 1 | lucky |
+| 7 | 1 | lucky |
+| 7 | 4 | odd |
+| 7 | 5 | prime |
+| 8 | 1 | even |
+| 9 | 1 | odd |
+| 9 | 2 | square |
+| 10 | 1 | even |
+| 10 | 2 | round |
++----+------+----------+
+</codeblock>
+
+ <p>
+ The following example shows how a magazine might prepare a list of history's wealthiest people. Croesus and
+ Midas are tied for second, then Crassus is fourth.
+ </p>
+
+<codeblock>select rank() over (order by net_worth desc) as rank, name, net_worth from wealth order by rank, name;
++------+---------+---------------+
+| rank | name | net_worth |
++------+---------+---------------+
+| 1 | Solomon | 2000000000.00 |
+| 2 | Croesus | 1000000000.00 |
+| 2 | Midas | 1000000000.00 |
+| 4 | Crassus | 500000000.00 |
+| 5 | Scrooge | 80000000.00 |
++------+---------+---------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_analytic_functions.xml#dense_rank"/>,
+ <xref href="impala_analytic_functions.xml#row_number"/>
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept rev="2.0.0" id="row_number">
+
+ <title>ROW_NUMBER() Function</title>
+
+ <conbody>
+
+ <p>
+ Returns an ascending sequence of integers, starting with 1. Starts the sequence over for each group
+ produced by the <codeph>PARTITION BY</codeph> clause. The output sequence includes different values for
+ duplicate input values. Therefore, the sequence never contains any duplicates or gaps, regardless of
+ duplicate input values.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>ROW_NUMBER() OVER([<varname>partition_by_clause</varname>] <varname>order_by_clause</varname>)</codeblock>
+
+ <p>
+ The <codeph>ORDER BY</codeph> clause is required. The <codeph>PARTITION BY</codeph> clause is optional. The
+ window clause is not allowed.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Often used for top-N and bottom-N queries where the input values are known to be unique, or precisely N
+ rows are needed regardless of duplicate values.
+ </p>
+
+ <p>
+ Because its result value is different for each row in the result set (when used without a <codeph>PARTITION
+ BY</codeph> clause), <codeph>ROW_NUMBER()</codeph> can be used to synthesize unique numeric ID values, for
+ example for result sets involving unique values or tuples.
+ </p>
+
+ <p>
+ Similar to <codeph>RANK</codeph> and <codeph>DENSE_RANK</codeph>. These functions differ in how they treat
+ duplicate combinations of values.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example demonstrates how <codeph>ROW_NUMBER()</codeph> produces a continuous numeric
+ sequence, even though some values of <codeph>X</codeph> are repeated.
+ </p>
+
+<codeblock>select x, row_number() over(order by x, property) as row_number, property from int_t;
++----+------------+----------+
+| x | row_number | property |
++----+------------+----------+
+| 1 | 1 | odd |
+| 1 | 2 | square |
+| 2 | 3 | even |
+| 2 | 4 | prime |
+| 3 | 5 | odd |
+| 3 | 6 | prime |
+| 4 | 7 | even |
+| 4 | 8 | square |
+| 5 | 9 | odd |
+| 5 | 10 | prime |
+| 6 | 11 | even |
+| 6 | 12 | perfect |
+| 7 | 13 | lucky |
+| 7 | 14 | lucky |
+| 7 | 15 | lucky |
+| 7 | 16 | odd |
+| 7 | 17 | prime |
+| 8 | 18 | even |
+| 9 | 19 | odd |
+| 9 | 20 | square |
+| 10 | 21 | even |
+| 10 | 22 | round |
++----+------------+----------+
+</codeblock>
+
+ <p>
+ The following example shows how a financial institution might assign customer IDs to some of history's
+ wealthiest figures. Although two of the people have identical net worth figures, unique IDs are required
+ for this purpose. <codeph>ROW_NUMBER()</codeph> produces a sequence of five different values for the five
+ input rows.
+ </p>
+
+<codeblock>select row_number() over (order by net_worth desc) as account_id, name, net_worth
+ from wealth order by account_id, name;
++------------+---------+---------------+
+| account_id | name | net_worth |
++------------+---------+---------------+
+| 1 | Solomon | 2000000000.00 |
+| 2 | Croesus | 1000000000.00 |
+| 3 | Midas | 1000000000.00 |
+| 4 | Crassus | 500000000.00 |
+| 5 | Scrooge | 80000000.00 |
++------------+---------+---------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_analytic_functions.xml#rank"/>, <xref href="impala_analytic_functions.xml#dense_rank"/>
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="sum_analytic">
+
+ <title>SUM() Function - Analytic Context</title>
+
+ <conbody>
+
+ <p>
+ You can include an <codeph>OVER</codeph> clause with a call to this function to use it as an analytic
+ function. See <xref href="impala_sum.xml#sum"/> for details and examples.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_appx_count_distinct.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_appx_count_distinct.xml b/docs/topics/impala_appx_count_distinct.xml
new file mode 100644
index 0000000..31a9679
--- /dev/null
+++ b/docs/topics/impala_appx_count_distinct.xml
@@ -0,0 +1,77 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.0.0" id="appx_count_distinct">
+
+ <title>APPX_COUNT_DISTINCT Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">APPX_COUNT_DISTINCT query option</indexterm>
+ Allows multiple <codeph>COUNT(DISTINCT)</codeph> operations within a single query, by internally rewriting
+ each <codeph>COUNT(DISTINCT)</codeph> to use the <codeph>NDV()</codeph> function. The resulting count is
+ approximate rather than precise.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/type_boolean"/>
+
+ <p conref="../shared/impala_common.xml#common/default_false_0"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following examples show how the <codeph>APPX_COUNT_DISTINCT</codeph> lets you work around the restriction
+ where a query can only evaluate <codeph>COUNT(DISTINCT <varname>col_name</varname>)</codeph> for a single
+ column. By default, you can count the distinct values of one column or another, but not both in a single
+ query:
+ </p>
+
+<codeblock>[localhost:21000] > select count(distinct x) from int_t;
++-------------------+
+| count(distinct x) |
++-------------------+
+| 10 |
++-------------------+
+[localhost:21000] > select count(distinct property) from int_t;
++--------------------------+
+| count(distinct property) |
++--------------------------+
+| 7 |
++--------------------------+
+[localhost:21000] > select count(distinct x), count(distinct property) from int_t;
+ERROR: AnalysisException: all DISTINCT aggregate functions need to have the same set of parameters
+as count(DISTINCT x); deviating function: count(DISTINCT property)
+</codeblock>
+
+ <p>
+ When you enable the <codeph>APPX_COUNT_DISTINCT</codeph> query option, now the query with multiple
+ <codeph>COUNT(DISTINCT)</codeph> works. The reason this behavior requires a query option is that each
+ <codeph>COUNT(DISTINCT)</codeph> is rewritten internally to use the <codeph>NDV()</codeph> function instead,
+ which provides an approximate result rather than a precise count.
+ </p>
+
+<codeblock>[localhost:21000] > set APPX_COUNT_DISTINCT=true;
+[localhost:21000] > select count(distinct x), count(distinct property) from int_t;
++-------------------+--------------------------+
+| count(distinct x) | count(distinct property) |
++-------------------+--------------------------+
+| 10 | 7 |
++-------------------+--------------------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_count.xml#count"/>,
+ <xref href="impala_distinct.xml#distinct"/>,
+ <xref href="impala_ndv.xml#ndv"/>
+ </p>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_appx_median.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_appx_median.xml b/docs/topics/impala_appx_median.xml
new file mode 100644
index 0000000..d874ead
--- /dev/null
+++ b/docs/topics/impala_appx_median.xml
@@ -0,0 +1,122 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.2.1" id="appx_median">
+
+ <title>APPX_MEDIAN Function</title>
+ <titlealts><navtitle>APPX_MEDIAN</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="Aggregate Functions"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">appx_median() function</indexterm>
+ An aggregate function that returns a value that is approximately the median (midpoint) of values in the set
+ of input values.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>APPX_MEDIAN([DISTINCT | ALL] <varname>expression</varname>)
+</codeblock>
+
+ <p>
+ This function works with any input type, because the only requirement is that the type supports less-than and
+ greater-than comparison operators.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Because the return value represents the estimated midpoint, it might not reflect the precise midpoint value,
+ especially if the cardinality of the input values is very high. If the cardinality is low (up to
+ approximately 20,000), the result is more accurate because the sampling considers all or almost all of the
+ different values.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/return_type_same_except_string"/>
+
+ <p>
+ The return value is always the same as one of the input values, not an <q>in-between</q> value produced by
+ averaging.
+ </p>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/restrictions_sliding_window"/> -->
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/analytic_not_allowed_caveat"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example uses a table of a million random floating-point numbers ranging up to approximately
+ 50,000. The average is approximately 25,000. Because of the random distribution, we would expect the median
+ to be close to this same number. Computing the precise median is a more intensive operation than computing
+ the average, because it requires keeping track of every distinct value and how many times each occurs. The
+ <codeph>APPX_MEDIAN()</codeph> function uses a sampling algorithm to return an approximate result, which in
+ this case is close to the expected value. To make sure that the value is not substantially out of range due
+ to a skewed distribution, subsequent queries confirm that there are approximately 500,000 values higher than
+ the <codeph>APPX_MEDIAN()</codeph> value, and approximately 500,000 values lower than the
+ <codeph>APPX_MEDIAN()</codeph> value.
+ </p>
+
+<codeblock>[localhost:21000] > select min(x), max(x), avg(x) from million_numbers;
++-------------------+-------------------+-------------------+
+| min(x) | max(x) | avg(x) |
++-------------------+-------------------+-------------------+
+| 4.725693727250069 | 49994.56852674231 | 24945.38563793553 |
++-------------------+-------------------+-------------------+
+[localhost:21000] > select appx_median(x) from million_numbers;
++----------------+
+| appx_median(x) |
++----------------+
+| 24721.6 |
++----------------+
+[localhost:21000] > select count(x) as higher from million_numbers where x > (select appx_median(x) from million_numbers);
++--------+
+| higher |
++--------+
+| 502013 |
++--------+
+[localhost:21000] > select count(x) as lower from million_numbers where x < (select appx_median(x) from million_numbers);
++--------+
+| lower |
++--------+
+| 497987 |
++--------+
+</codeblock>
+
+ <p>
+ The following example computes the approximate median using a subset of the values from the table, and then
+ confirms that the result is a reasonable estimate for the midpoint.
+ </p>
+
+<codeblock>[localhost:21000] > select appx_median(x) from million_numbers where x between 1000 and 5000;
++-------------------+
+| appx_median(x) |
++-------------------+
+| 3013.107787358159 |
++-------------------+
+[localhost:21000] > select count(x) as higher from million_numbers where x between 1000 and 5000 and x > 3013.107787358159;
++--------+
+| higher |
++--------+
+| 37692 |
++--------+
+[localhost:21000] > select count(x) as lower from million_numbers where x between 1000 and 5000 and x < 3013.107787358159;
++-------+
+| lower |
++-------+
+| 37089 |
++-------+
+</codeblock>
+ </conbody>
+</concept>
[17/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_complex_types.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_complex_types.xml b/docs/topics/impala_complex_types.xml
new file mode 100644
index 0000000..9fe7362
--- /dev/null
+++ b/docs/topics/impala_complex_types.xml
@@ -0,0 +1,2725 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.3.0" id="complex_types">
+
+ <title id="nested_types">Complex Types (CDH 5.5 and higher only)</title>
+
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">complex types</indexterm>
+
+ <indexterm audience="Cloudera">nested types</indexterm>
+ <term>Complex types</term> (also referred to as <term>nested types</term>) let you represent multiple data values within a single
+ row/column position. They differ from the familiar column types such as <codeph>BIGINT</codeph> and <codeph>STRING</codeph>, known as
+ <term>scalar types</term> or <term>primitive types</term>, which represent a single data value within a given row/column position.
+ Impala supports the complex types <codeph>ARRAY</codeph>, <codeph>MAP</codeph>, and <codeph>STRUCT</codeph> in Impala 2.3 / CDH 5.5
+ and higher. The Hive <codeph>UNION</codeph> type is not currently supported.
+ </p>
+
+ <p outputclass="toc inpage"/>
+
+ <p>
+ Once you understand the basics of complex types, refer to the individual type topics when you need to refresh your memory about syntax
+ and examples:
+ </p>
+
+ <ul>
+ <li>
+ <xref href="impala_array.xml#array"/>
+ </li>
+
+ <li>
+ <xref href="impala_struct.xml#struct"/>
+ </li>
+
+ <li>
+ <xref href="impala_map.xml#map"/>
+ </li>
+ </ul>
+
+ </conbody>
+
+ <concept id="complex_types_benefits">
+
+ <title>Benefits of Impala Complex Types</title>
+
+ <conbody>
+
+ <p>
+ The reasons for using Impala complex types include the following:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+            You already have data produced by Hive or another non-Impala component that uses the complex type column names. You might need to
+ convert the underlying data to Parquet to use it with Impala.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Your data model originates with a non-SQL programming language or a NoSQL data management system. For example, if you are
+ representing Python data expressed as nested lists, dictionaries, and tuples, those data structures correspond closely to Impala
+ <codeph>ARRAY</codeph>, <codeph>MAP</codeph>, and <codeph>STRUCT</codeph> types.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Your analytic queries involving multiple tables could benefit from greater locality during join processing. By packing more
+ related data items within each HDFS data block, complex types let join queries avoid the network overhead of the traditional
+ Hadoop shuffle or broadcast join techniques.
+ </p>
+ </li>
+ </ul>
+
+ <p>
+ The Impala complex type support produces result sets with all scalar values, and the scalar components of complex types can be used
+ with all SQL clauses, such as <codeph>GROUP BY</codeph>, <codeph>ORDER BY</codeph>, all kinds of joins, subqueries, and inline
+ views. The ability to process complex type data entirely in SQL reduces the need to write application-specific code in Java or other
+ programming languages to deconstruct the underlying data structures.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="complex_types_overview">
+
+ <title>Overview of Impala Complex Types</title>
+
+ <conbody>
+
+ <p>
+<!--
+ Each <codeph>ARRAY</codeph>, <codeph>MAP</codeph>, or <codeph>STRUCT</codeph> column can include multiple instances of scalar types
+ such as <codeph>BIGINT</codeph> and <codeph>STRING</codeph>.
+-->
+ The <codeph>ARRAY</codeph> and <codeph>MAP</codeph> types are closely related: they represent collections with arbitrary numbers of
+ elements, where each element is the same type. In contrast, <codeph>STRUCT</codeph> groups together a fixed number of items into a
+ single element. The parts of a <codeph>STRUCT</codeph> element (the <term>fields</term>) can be of different types, and each field
+ has a name.
+ </p>
+
+ <p>
+ The elements of an <codeph>ARRAY</codeph> or <codeph>MAP</codeph>, or the fields of a <codeph>STRUCT</codeph>, can also be other
+ complex types. You can construct elaborate data structures with up to 100 levels of nesting. For example, you can make an
+ <codeph>ARRAY</codeph> whose elements are <codeph>STRUCT</codeph>s. Within each <codeph>STRUCT</codeph>, you can have some fields
+ that are <codeph>ARRAY</codeph>, <codeph>MAP</codeph>, or another kind of <codeph>STRUCT</codeph>. The Impala documentation uses the
+ terms complex and nested types interchangeably; for simplicity, it primarily uses the term complex types, to encompass all the
+ properties of these types.
+ </p>
+
+ <p>
+ When visualizing your data model in familiar SQL terms, you can think of each <codeph>ARRAY</codeph> or <codeph>MAP</codeph> as a
+ miniature table, and each <codeph>STRUCT</codeph> as a row within such a table. By default, the table represented by an
+ <codeph>ARRAY</codeph> has two columns, <codeph>POS</codeph> to represent ordering of elements, and <codeph>ITEM</codeph>
+ representing the value of each element. Likewise, by default, the table represented by a <codeph>MAP</codeph> encodes key-value
+ pairs, and therefore has two columns, <codeph>KEY</codeph> and <codeph>VALUE</codeph>.
+<!--
+ When you use a <codeph>STRUCT</codeph> as an
+ <codeph>ARRAY</codeph> element or the <codeph>VALUE</codeph> part of a <codeph>MAP</codeph>, the field names of the
+ <codeph>STRUCT</codeph> become additional columns in the result set.
+-->
+ </p>
+
+ <p>
+ The <codeph>ITEM</codeph> and <codeph>VALUE</codeph> names are only required for the very simplest kinds of <codeph>ARRAY</codeph>
+ and <codeph>MAP</codeph> columns, ones that hold only scalar values. When the elements within the <codeph>ARRAY</codeph> or
+ <codeph>MAP</codeph> are of type <codeph>STRUCT</codeph> rather than a scalar type, then the result set contains columns with names
+ corresponding to the <codeph>STRUCT</codeph> fields rather than <codeph>ITEM</codeph> or <codeph>VALUE</codeph>.
+ </p>
+
+<!--
+ <p>
+ <codeph>ARRAY</codeph> and <codeph>MAP</codeph> are both <term>collection</term> types, which can have a variable number of
+ elements; <codeph>ARRAY</codeph> and <codeph>MAP</codeph> are typically used as the top-level type of a table column.
+ <codeph>STRUCT</codeph> represents a single element and has a fixed number of fields; <codeph>STRUCT</codeph> is typically used as
+ the final, lowest level of a nested type definition.
+ </p>
+-->
+
+ <p>
+ You write most queries that process complex type columns using familiar join syntax, even though the data for both sides of the join
+ resides in a single table. The join notation brings together the scalar values from a row with the values from the complex type
+ columns for that same row. The final result set contains all scalar values, allowing you to do all the familiar filtering,
+ aggregation, ordering, and so on for the complex data entirely in SQL or using business intelligence tools that issue SQL queries.
+<!--
+ Instead of pulling together values from different tables, the join selects the specified values from both
+ the scalar columns, and from inside the complex type columns, producing a flattened result set consisting of all scalar values. When
+ doing a join query involving a complex type column, Impala derives the join key automatically, without the need to create additional
+ ID columns in the table.
+-->
+ </p>
+
+ <p>
+ Behind the scenes, Impala ensures that the processing for each row is done efficiently on a single host, without the network traffic
+ involved in broadcast or shuffle joins. The most common type of join query for tables with complex type columns is <codeph>INNER
+ JOIN</codeph>, which returns results only in those cases where the complex type contains some elements. Therefore, most query
+ examples in this section use either the <codeph>INNER JOIN</codeph> clause or the equivalent comma notation.
+ </p>
+
+ <note>
+ <p>
+ Although Impala can query complex types that are present in Parquet files, Impala currently cannot create new Parquet files
+ containing complex types. Therefore, the discussion and examples presume that you are working with existing Parquet data produced
+ through Hive, Spark, or some other source. See <xref href="#complex_types_ex_hive_etl"/> for examples of constructing Parquet data
+ files with complex type columns.
+ </p>
+
+ <p>
+ For learning purposes, you can create empty tables with complex type columns and practice query syntax, even if you do not have
+ sample data with the required structure.
+ </p>
+ </note>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="complex_types_design">
+
+ <title>Design Considerations for Complex Types</title>
+
+ <conbody>
+
+ <p>
+ When planning to use Impala complex types, and designing the Impala schema, first learn how this kind of schema differs from
+ traditional table layouts from the relational database and data warehousing fields. Because you might have already encountered
+ complex types in a Hadoop context while using Hive for ETL, also learn how to write high-performance analytic queries for complex
+ type data using Impala SQL syntax.
+ </p>
+
+ <p outputclass="toc inpage"/>
+
+ </conbody>
+
+ <concept id="complex_types_vs_rdbms">
+
+ <title>How Complex Types Differ from Traditional Data Warehouse Schemas</title>
+
+ <conbody>
+
+ <p>
+ Complex types let you associate arbitrary data structures with a particular row. If you are familiar with schema design for
+ relational database management systems or data warehouses, a schema with complex types has the following differences:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ Logically, related values can now be grouped tightly together in the same table.
+ </p>
+
+ <p>
+ In traditional data warehousing, related values were typically arranged in one of two ways:
+ </p>
+ <ul>
+ <li>
+ <p>
+ Split across multiple normalized tables. Foreign key columns specified which rows from each table were associated with
+ each other. This arrangement avoided duplicate data and therefore the data was compact, but join queries could be
+ expensive because the related data had to be retrieved from separate locations. (In the case of distributed Hadoop
+ queries, the joined tables might even be transmitted between different hosts in a cluster.)
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Flattened into a single denormalized table. Although this layout eliminated some potential performance issues by removing
+ the need for join queries, the table typically became larger because values were repeated. The extra data volume could
+ cause performance issues in other parts of the workflow, such as longer ETL cycles or more expensive full-table scans
+ during queries.
+ </p>
+ </li>
+ </ul>
+ <p>
+ Complex types represent a middle ground that addresses these performance and volume concerns. By physically locating related
+ data within the same data files, complex types increase locality and reduce the expense of join queries. By associating an
+ arbitrary amount of data with a single row, complex types avoid the need to repeat lengthy values such as strings. Because
+ Impala knows which complex type values are associated with each row, you can save storage by avoiding artificial foreign key
+ values that are only used for joins. The flexibility of the <codeph>STRUCT</codeph>, <codeph>ARRAY</codeph>, and
+ <codeph>MAP</codeph> types lets you model familiar constructs such as fact and dimension tables from a data warehouse, and
+            wide tables representing sparse matrices.
+ </p>
+ </li>
+ </ul>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="complex_types_physical">
+
+ <title>Physical Storage for Complex Types</title>
+
+ <conbody>
+
+ <p>
+ Physically, the scalar and complex columns in each row are located adjacent to each other in the same Parquet data file, ensuring
+ that they are processed on the same host rather than being broadcast across the network when cross-referenced within a query. This
+ co-location simplifies the process of copying, converting, and backing all the columns up at once. Because of the column-oriented
+ layout of Parquet files, you can still query only the scalar columns of a table without imposing the I/O penalty of reading the
+ (possibly large) values of the composite columns.
+ </p>
+
+ <p>
+ Within each Parquet data file, the constituent parts of complex type columns are stored in column-oriented format:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ Each field of a <codeph>STRUCT</codeph> type is stored like a column, with all the scalar values adjacent to each other and
+ encoded, compressed, and so on using the Parquet space-saving techniques.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ For an <codeph>ARRAY</codeph> containing scalar values, all those values (represented by the <codeph>ITEM</codeph>
+ pseudocolumn) are stored adjacent to each other.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ For a <codeph>MAP</codeph>, the values of the <codeph>KEY</codeph> pseudocolumn are stored adjacent to each other. If the
+ <codeph>VALUE</codeph> pseudocolumn is a scalar type, its values are also stored adjacent to each other.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If an <codeph>ARRAY</codeph> element, <codeph>STRUCT</codeph> field, or <codeph>MAP</codeph> <codeph>VALUE</codeph> part is
+ another complex type, the column-oriented storage applies to the next level down (or the next level after that, and so on for
+ deeply nested types) where the final elements, fields, or values are of scalar types.
+ </p>
+ </li>
+ </ul>
+
+ <p>
+ The numbers represented by the <codeph>POS</codeph> pseudocolumn of an <codeph>ARRAY</codeph> are not physically stored in the
+ data files. They are synthesized at query time based on the order of the <codeph>ARRAY</codeph> elements associated with each row.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="complex_types_file_formats">
+
+ <title>File Format Support for Impala Complex Types</title>
+
+ <conbody>
+
+ <p>
+ Currently, Impala queries support complex type data only in the Parquet file format. See <xref href="impala_parquet.xml#parquet"/>
+ for details about the performance benefits and physical layout of this file format.
+ </p>
+
+ <p>
+ Each table, or each partition within a table, can have a separate file format, and you can change file format at the table or
+ partition level through an <codeph>ALTER TABLE</codeph> statement. Because this flexibility makes it difficult to guarantee ahead
+ of time that all the data files for a table or partition are in a compatible format, Impala does not throw any errors when you
+ change the file format for a table or partition using <codeph>ALTER TABLE</codeph>. Any errors come at runtime when Impala
+ actually processes a table or partition that contains nested types and is not in one of the supported formats. If a query on a
+ partitioned table only processes some partitions, and all those partitions are in one of the supported formats, the query
+ succeeds.
+ </p>
+
+ <p>
+ Because Impala does not parse the data structures containing nested types for unsupported formats such as text, Avro,
+ SequenceFile, or RCFile, you cannot use data files in these formats with Impala, even if the query does not refer to the nested
+ type columns. Also, if a table using an unsupported format originally contained nested type columns, and then those columns were
+ dropped from the table using <codeph>ALTER TABLE ... DROP COLUMN</codeph>, any existing data files in the table still contain the
+ nested type data and Impala queries on that table will generate errors.
+ </p>
+
+ <p>
+ You can perform DDL operations (even <codeph>CREATE TABLE</codeph>) for tables involving complex types in file formats other than
+ Parquet. The DDL support lets you set up intermediate tables in your ETL pipeline, to be populated by Hive, before the final stage
+ where the data resides in a Parquet table and is queryable by Impala. Also, you can have a partitioned table with complex type
+ columns that uses a non-Parquet format, and use <codeph>ALTER TABLE</codeph> to change the file format to Parquet for individual
+ partitions. When you put Parquet data files into those partitions, Impala can execute queries against that data as long as the
+ query does not involve any of the non-Parquet partitions.
+ </p>
+
+ <p>
+ If you use the <cmdname>parquet-tools</cmdname> command to examine the structure of a Parquet data file that includes complex
+ types, you see that both <codeph>ARRAY</codeph> and <codeph>MAP</codeph> are represented as a <codeph>Bag</codeph> in Parquet
+ terminology, with all fields marked <codeph>Optional</codeph> because Impala allows any column to be nullable.
+ </p>
+
+ <p>
+        Impala supports either 2-level or 3-level encoding within each Parquet data file. When constructing Parquet data files outside
+ Impala, use either encoding style but do not mix 2-level and 3-level encoding within the same data file.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="complex_types_vs_normalization">
+
+ <title>Choosing Between Complex Types and Normalized Tables</title>
+
+ <conbody>
+
+ <p>
+ Choosing between multiple normalized fact and dimension tables, or a single table containing complex types, is an important design
+ decision.
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ If you are coming from a traditional database or data warehousing background, you might be familiar with how to split up data
+ between tables. Your business intelligence tools might already be optimized for dealing with this kind of multi-table scenario
+ through join queries.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If you are pulling data from Impala into an application written in a programming language that has data structures analogous
+ to the complex types, such as Python or Java, complex types in Impala could simplify data interchange and improve
+ understandability and reliability of your program logic.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ You might already be faced with existing infrastructure or receive high volumes of data that assume one layout or the other.
+ For example, complex types are popular with web-oriented applications, for example to keep information about an online user
+ all in one place for convenient lookup and analysis, or to deal with sparse or constantly evolving data fields.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If some parts of the data change over time while related data remains constant, using multiple normalized tables lets you
+ replace certain parts of the data without reloading the entire data set. Conversely, if you receive related data all bundled
+ together, such as in JSON files, using complex types can save the overhead of splitting the related items across multiple
+ tables.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ From a performance perspective:
+ </p>
+ <ul>
+ <li>
+ <p>
+ In Parquet tables, Impala can skip columns that are not referenced in a query, avoiding the I/O penalty of reading the
+ embedded data. When complex types are nested within a column, the data is physically divided at a very granular level; for
+ example, a query referring to data nested multiple levels deep in a complex type column does not have to read all the data
+ from that column, only the data for the relevant parts of the column type hierarchy.
+<!-- Avro not supported in 5.5 / 2.3: Avro tables might experience some performance overhead due to
+ the need to skip past the complex type columns in each row when reading the data. -->
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Complex types avoid the possibility of expensive join queries when data from fact and dimension tables is processed in
+            parallel across multiple hosts. All the information for a row containing complex types is typically in the same data
+ block, and therefore does not need to be transmitted across the network when joining fields that are all part of the same
+ row.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ The tradeoff with complex types is that fewer rows fit in each data block. Whether it is better to have more data blocks
+ with fewer rows, or fewer data blocks with many rows, depends on the distribution of your data and the characteristics of
+ your query workload. If the complex columns are rarely referenced, using them might lower efficiency. If you are seeing
+ low parallelism due to a small volume of data (relatively few data blocks) in each table partition, increasing the row
+ size by including complex columns might produce more data blocks and thus spread the work more evenly across the cluster.
+ See <xref href="impala_scalability.xml#scalability"/> for more on this advanced topic.
+ </p>
+ </li>
+ </ul>
+ </li>
+ </ul>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="complex_types_hive">
+
+ <title>Differences Between Impala and Hive Complex Types</title>
+
+ <conbody>
+
+<!-- HiveQL functions like nested type constructors and posexplode(): https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF -->
+
+<!-- HiveQL complex types: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types#LanguageManualTypes-ComplexTypes -->
+
+<!-- HiveQL lateral views: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+LateralView -->
+
+ <p>
+ Impala can query Parquet tables containing <codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, and <codeph>MAP</codeph> columns
+ produced by Hive. There are some differences to be aware of between the Impala SQL and HiveQL syntax for complex types, primarily
+ for queries.
+ </p>
+
+ <p>
+ The syntax for specifying <codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, and <codeph>MAP</codeph> types in a <codeph>CREATE
+ TABLE</codeph> statement is compatible between Impala and Hive.
+ </p>
+
+ <p>
+ Because Impala <codeph>STRUCT</codeph> columns include user-specified field names, you use the <codeph>NAMED_STRUCT()</codeph>
+ constructor in Hive rather than the <codeph>STRUCT()</codeph> constructor when you populate an Impala <codeph>STRUCT</codeph>
+ column using a Hive <codeph>INSERT</codeph> statement.
+ </p>
+
+ <p>
+ The Hive <codeph>UNION</codeph> type is not currently supported in Impala.
+ </p>
+
+ <p>
+ While Impala usually aims for a high degree of compatibility with HiveQL query syntax, Impala syntax differs from Hive for queries
+ involving complex types. The differences are intended to provide extra flexibility for queries involving these kinds of tables.
+ </p>
+
+ <ul>
+ <li>
+ Impala uses dot notation for referring to element names or elements within complex types, and join notation for
+ cross-referencing scalar columns with the elements of complex types within the same row, rather than the <codeph>LATERAL
+ VIEW</codeph> clause and <codeph>EXPLODE()</codeph> function of HiveQL.
+ </li>
+
+ <li>
+ Using join notation lets you use all the kinds of join queries with complex type columns. For example, you can use a
+ <codeph>LEFT OUTER JOIN</codeph>, <codeph>LEFT ANTI JOIN</codeph>, or <codeph>LEFT SEMI JOIN</codeph> query to evaluate
+ different scenarios where the complex columns do or do not contain any elements.
+ </li>
+
+ <li>
+ You can include references to collection types inside subqueries and inline views. For example, you can construct a
+ <codeph>FROM</codeph> clause where one of the <q>tables</q> is a subquery against a complex type column, or use a subquery
+ against a complex type column as the argument to an <codeph>IN</codeph> or <codeph>EXISTS</codeph> clause.
+ </li>
+
+ <li>
+ The Impala pseudocolumn <codeph>POS</codeph> lets you retrieve the position of elements in an array along with the elements
+ themselves, equivalent to the <codeph>POSEXPLODE()</codeph> function of HiveQL. You do not use index notation to retrieve a
+ single array element in a query; the join query loops through the array elements and you use <codeph>WHERE</codeph> clauses to
+ specify which elements to return.
+ </li>
+
+ <li>
+ <p>
+ Join clauses involving complex type columns do not require an <codeph>ON</codeph> or <codeph>USING</codeph> clause. Impala
+ implicitly applies the join key so that the correct array entries or map elements are associated with the correct row from the
+ table.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Impala does not currently support the <codeph>UNION</codeph> complex type.
+ </p>
+ </li>
+ </ul>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="complex_types_limits">
+
+ <title>Limitations and Restrictions for Complex Types</title>
+
+ <conbody>
+
+ <p>
+ Complex type columns can only be used in tables or partitions with the Parquet file format.
+ </p>
+
+ <p>
+ Complex type columns cannot be used as partition key columns in a partitioned table.
+ </p>
+
+ <p>
+ When you use complex types with the <codeph>ORDER BY</codeph>, <codeph>GROUP BY</codeph>, <codeph>HAVING</codeph>, or
+ <codeph>WHERE</codeph> clauses, you cannot refer to the column name by itself. Instead, you refer to the names of the scalar
+ values within the complex type, such as the <codeph>ITEM</codeph>, <codeph>POS</codeph>, <codeph>KEY</codeph>, or
+ <codeph>VALUE</codeph> pseudocolumns, or the field names from a <codeph>STRUCT</codeph>.
+ </p>
+
+ <p>
+ The maximum depth of nesting for complex types is 100 levels.
+ </p>
+
+ <p>
+ For ideal performance and scalability, use small or medium-sized collections, where all the complex columns contain at most a few
+ hundred megabytes per row. Remember, all the columns of a row are stored in the same HDFS data block, whose size in Parquet files
+ typically ranges from 256 MB to 1 GB.
+ </p>
+
+ <p>
+ Including complex type columns in a table introduces some overhead that might make queries that do not reference those columns
+ somewhat slower than Impala queries against tables without any complex type columns. Expect at most a 2x slowdown compared to
+ tables that do not have any complex type columns.
+ </p>
+
+ <p>
+ Currently, the <codeph>COMPUTE STATS</codeph> statement does not collect any statistics for columns containing complex types.
+ Impala uses heuristics to construct execution plans involving complex type columns.
+ </p>
+
+ <p>
+ Currently, Impala built-in functions and user-defined functions cannot accept complex types as parameters or produce them as
+ function return values. (When the complex type values are materialized in an Impala result set, the result set contains the scalar
+ components of the values, such as the <codeph>POS</codeph> or <codeph>ITEM</codeph> for an <codeph>ARRAY</codeph>, the
+ <codeph>KEY</codeph> or <codeph>VALUE</codeph> for a <codeph>MAP</codeph>, or the fields of a <codeph>STRUCT</codeph>; these
+ scalar data items <i>can</i> be used with built-in functions and UDFs as usual.)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_read_only"/>
+
+ <p>
+ Currently, Impala can query complex type columns only from Parquet tables or Parquet partitions within partitioned tables.
+ Although you can use complex types in tables with Avro, text, and other file formats as part of your ETL pipeline, for example as
+ intermediate tables populated through Hive, doing analytics through Impala requires that the data eventually ends up in a Parquet
+ table. The requirement for Parquet data files means that you can use complex types with Impala tables hosted on other kinds of
+ file storage systems such as Isilon and Amazon S3, but you cannot use Impala to query complex types from HBase tables. See
+ <xref href="impala_complex_types.xml#complex_types_file_formats"/> for more details.
+ </p>
+
+ </conbody>
+
+ </concept>
+
+ </concept>
+
+ <concept id="complex_types_using">
+
+ <title>Using Complex Types from SQL</title>
+
+ <conbody>
+
+ <p>
+      When using complex types through SQL in Impala, you learn the notation for <codeph>&lt; &gt;</codeph> delimiters for the complex
+ type columns in <codeph>CREATE TABLE</codeph> statements, and how to construct join queries to <q>unpack</q> the scalar values
+ nested inside the complex data structures. You might need to condense a traditional RDBMS or data warehouse schema into a smaller
+ number of Parquet tables, and use Hive, Spark, Pig, or other mechanism outside Impala to populate the tables with data.
+ </p>
+
+ <p outputclass="toc inpage"/>
+
+ </conbody>
+
+ <concept id="nested_types_ddl">
+
+ <title>Complex Type Syntax for DDL Statements</title>
+
+ <conbody>
+
+ <p>
+ The definition of <varname>data_type</varname>, as seen in the <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph>
+ statements, now includes complex types in addition to primitive types:
+ </p>
+
+<codeblock> primitive_type
+| array_type
+| map_type
+| struct_type
+</codeblock>
+
+ <p>
+ Unions are not currently supported.
+ </p>
+
+ <p>
+ Array, struct, and map column type declarations are specified in the <codeph>CREATE TABLE</codeph> statement. You can also add or
+ change the type of complex columns through the <codeph>ALTER TABLE</codeph> statement.
+ </p>
+
+ <note>
+ <p>
+ Currently, Impala queries allow complex types only in tables that use the Parquet format. If an Impala query encounters complex
+ types in a table or partition using another file format, the query returns a runtime error.
+ </p>
+
+ <p>
+ The Impala DDL support for complex types works for all file formats, so that you can create tables using text or other
+ non-Parquet formats for Hive to use as staging tables in an ETL cycle that ends with the data in a Parquet table. You can also
+ use <codeph>ALTER TABLE ... SET FILEFORMAT PARQUET</codeph> to change the file format of an existing table containing complex
+ types to Parquet, after which Impala can query it. Make sure to load Parquet files into the table after changing the file
+ format, because the <codeph>ALTER TABLE ... SET FILEFORMAT</codeph> statement does not convert existing data to the new file
+ format.
+ </p>
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_partitioning"/>
+
+ <p>
+ Because use cases for Impala complex types require that you already have Parquet data files produced outside of Impala, you can
+ use the Impala <codeph>CREATE TABLE LIKE PARQUET</codeph> syntax to produce a table with columns that match the structure of an
+ existing Parquet file, including complex type columns for nested data structures. Remember to include the <codeph>STORED AS
+ PARQUET</codeph> clause in this case, because even with <codeph>CREATE TABLE LIKE PARQUET</codeph>, the default file format of the
+ resulting table is still text.
+ </p>
+
+ <p>
+ Because the complex columns are omitted from the result set of an Impala <codeph>SELECT *</codeph> or <codeph>SELECT
+ <varname>col_name</varname></codeph> query, and because Impala currently does not support writing Parquet files with complex type
+ columns, you cannot use the <codeph>CREATE TABLE AS SELECT</codeph> syntax to create a table with nested type columns.
+ </p>
+
+ <note>
+ <p>
+ Once you have a table set up with complex type columns, use the <codeph>DESCRIBE</codeph> and <codeph>SHOW CREATE TABLE</codeph>
+ statements to see the correct notation with <codeph><</codeph> and <codeph>></codeph> delimiters and comma and colon
+ separators within the complex type definitions. If you do not have existing data with the same layout as the table, you can
+ query the empty table to practice with the notation for the <codeph>SELECT</codeph> statement. In the <codeph>SELECT</codeph>
+ list, you use dot notation and pseudocolumns such as <codeph>ITEM</codeph>, <codeph>KEY</codeph>, and <codeph>VALUE</codeph> for
+ referring to items within the complex type columns. In the <codeph>FROM</codeph> clause, you use join notation to construct
+ table aliases for any referenced <codeph>ARRAY</codeph> and <codeph>MAP</codeph> columns.
+ </p>
+ </note>
+
+<!-- To do: show some simple CREATE TABLE statements for each of the complex types, without so much backstory for the schema. -->
+
+ <p>
+ For example, when defining a table that holds contact information, you might represent phone numbers differently depending on the
+ expected layout and relationships of the data, and how well you can predict those properties in advance.
+ </p>
+
+ <p>
+ Here are different ways that you might represent phone numbers in a traditional relational schema, with equivalent representations
+ using complex types.
+ </p>
+
+ <fig id="complex_types_phones_flat_fixed">
+
+ <title>Traditional Relational Representation of Phone Numbers: Single Table</title>
+
+ <p>
+ The traditional, simplest way to represent phone numbers in a relational table is to store all contact info in a single table,
+ with all columns having scalar types, and each potential phone number represented as a separate column. In this example, each
+ person can only have these 3 types of phone numbers. If the person does not have a particular kind of phone number, the
+ corresponding column is <codeph>NULL</codeph> for that row.
+ </p>
+
+<codeblock>
+CREATE TABLE contacts_fixed_phones
+(
+ id BIGINT
+ , name STRING
+ , address STRING
+ , home_phone STRING
+ , work_phone STRING
+ , mobile_phone STRING
+) STORED AS PARQUET;
+</codeblock>
+
+ </fig>
+
+ <fig id="complex_types_phones_array">
+
+ <title>An Array of Phone Numbers</title>
+
+ <p>
+ Using a complex type column to represent the phone numbers adds some extra flexibility. Now there could be an unlimited number
+ of phone numbers. Because the array elements have an order but not symbolic names, you could decide in advance that
+ phone_number[0] is the home number, [1] is the work number, [2] is the mobile number, and so on. (In subsequent examples, you
+ will see how to create a more flexible naming scheme using other complex type variations, such as a <codeph>MAP</codeph> or an
+ <codeph>ARRAY</codeph> where each element is a <codeph>STRUCT</codeph>.)
+ </p>
+
+<codeblock><![CDATA[
+CREATE TABLE contacts_array_of_phones
+(
+ id BIGINT
+ , name STRING
+ , address STRING
+ , phone_number ARRAY < STRING >
+) STORED AS PARQUET;
+]]>
+</codeblock>
+
+ </fig>
+
+ <fig id="complex_types_phones_map">
+
+ <title>A Map of Phone Numbers</title>
+
+ <p>
+ Another way to represent an arbitrary set of phone numbers is with a <codeph>MAP</codeph> column. With a <codeph>MAP</codeph>,
+ each element is associated with a key value that you specify, which could be a numeric, string, or other scalar type. This
+ example uses a <codeph>STRING</codeph> key to give each phone number a name, such as <codeph>'home'</codeph> or
+ <codeph>'mobile'</codeph>. A query could filter the data based on the key values, or display the key values in reports.
+ </p>
+
+<codeblock><![CDATA[
+CREATE TABLE contacts_unlimited_phones
+(
+ id BIGINT, name STRING, address STRING, phone_number MAP < STRING,STRING >
+) STORED AS PARQUET;
+]]>
+</codeblock>
+
+ </fig>
+
+ <fig id="complex_types_phones_flat_normalized">
+
+ <title>Traditional Relational Representation of Phone Numbers: Normalized Tables</title>
+
+ <p>
+ If you are an experienced database designer, you already know how to work around the limitations of the single-table schema from
+ <xref href="#nested_types_ddl/complex_types_phones_flat_fixed"/>. By normalizing the schema, with the phone numbers in their own
+ table, you can associate an arbitrary set of phone numbers with each person, and associate additional details with each phone
+ number, such as whether it is a home, work, or mobile phone.
+ </p>
+
+ <p>
+ The flexibility of this approach comes with some drawbacks. Reconstructing all the data for a particular person requires a join
+ query, which might require performance tuning on Hadoop because the data from each table might be transmitted from a different
+ host. Data management tasks such as backups and refreshing the data require dealing with multiple tables instead of a single
+ table.
+ </p>
+
+ <p>
+ This example illustrates a traditional database schema to store contact info normalized across 2 tables. The fact table
+ establishes the identity and basic information about each person. A dimension table stores information only about phone numbers,
+ using an ID value to associate each phone number with a person ID from the fact table. Each person can have 0, 1, or many
+ phones; the categories are not restricted to a few predefined ones; and the phone table can contain as many columns as desired,
+ to represent all sorts of details about each phone number.
+ </p>
+
+<codeblock>
+CREATE TABLE fact_contacts (id BIGINT, name STRING, address STRING) STORED AS PARQUET;
+CREATE TABLE dim_phones
+(
+ contact_id BIGINT
+ , category STRING
+ , international_code STRING
+ , area_code STRING
+ , exchange STRING
+ , extension STRING
+ , mobile BOOLEAN
+ , carrier STRING
+ , current BOOLEAN
+ , service_start_date TIMESTAMP
+ , service_end_date TIMESTAMP
+)
+STORED AS PARQUET;
+</codeblock>
+
+ </fig>
+
+ <fig id="complex_types_phones_array_struct">
+
+ <title>Phone Numbers Represented as an Array of Structs</title>
+
+ <p>
+ To represent a schema equivalent to the one from <xref href="#nested_types_ddl/complex_types_phones_flat_normalized"/> using
+ complex types, this example uses an <codeph>ARRAY</codeph> where each array element is a <codeph>STRUCT</codeph>. As with the
+ earlier complex type examples, each person can have an arbitrary set of associated phone numbers. Making each array element into
+ a <codeph>STRUCT</codeph> lets us associate multiple data items with each phone number, and give a separate name and type to
+ each data item. The <codeph>STRUCT</codeph> fields of the <codeph>ARRAY</codeph> elements reproduce the columns of the dimension
+ table from the previous example.
+ </p>
+
+ <p>
+ You can do all the same kinds of queries with the complex type schema as with the normalized schema from the previous example.
+ The advantages of the complex type design are in the areas of convenience and performance. Now your backup and ETL processes
+ only deal with a single table. When a query uses a join to cross-reference the information about a person with their associated
+ phone numbers, all the relevant data for each row resides in the same HDFS data block, meaning each row can be processed on a
+ single host without requiring network transmission.
+ </p>
+
+<codeblock><![CDATA[
+CREATE TABLE contacts_detailed_phones
+(
+ id BIGINT, name STRING, address STRING
+ , phone ARRAY < STRUCT <
+ category: STRING
+ , international_code: STRING
+ , area_code: STRING
+ , exchange: STRING
+ , extension: STRING
+ , mobile: BOOLEAN
+ , carrier: STRING
+ , current: BOOLEAN
+ , service_start_date: TIMESTAMP
+ , service_end_date: TIMESTAMP
+ >>
+) STORED AS PARQUET;
+]]>
+</codeblock>
+
+ </fig>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="complex_types_sql">
+
+ <title>SQL Statements that Support Complex Types</title>
+
+ <conbody>
+
+ <p>
+ The Impala SQL statements that support complex types are currently
+ <codeph><xref href="impala_create_table.xml#create_table">CREATE TABLE</xref></codeph>,
+ <codeph><xref href="impala_alter_table.xml#alter_table">ALTER TABLE</xref></codeph>,
+ <codeph><xref href="impala_describe.xml#describe">DESCRIBE</xref></codeph>,
+ <codeph><xref href="impala_load_data.xml#load_data">LOAD DATA</xref></codeph>, and
+ <codeph><xref href="impala_select.xml#select">SELECT</xref></codeph>. That is, currently Impala can create or alter tables
+ containing complex type columns, examine the structure of a table containing complex type columns, import existing data files
+ containing complex type columns into a table, and query Parquet tables containing complex types.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_read_only"/>
+
+ <p outputclass="toc inpage"/>
+
+ </conbody>
+
+ <concept id="complex_types_ddl">
+
+ <title>DDL Statements and Complex Types</title>
+
+ <conbody>
+
+ <p>
+ Column specifications for complex or nested types use <codeph><</codeph> and <codeph>></codeph> delimiters:
+ </p>
+
+<codeblock><![CDATA[-- What goes inside the < > for an ARRAY is a single type, either a scalar or another
+-- complex type (ARRAY, STRUCT, or MAP).
+CREATE TABLE array_t
+(
+ id BIGINT,
+ a1 ARRAY <STRING>,
+ a2 ARRAY <BIGINT>,
+ a3 ARRAY <TIMESTAMP>,
+ a4 ARRAY <STRUCT <f1: STRING, f2: INT, f3: BOOLEAN>>
+)
+STORED AS PARQUET;
+
+-- What goes inside the < > for a MAP is two comma-separated types specifying the types of the key-value pair:
+-- a scalar type representing the key, and a scalar or complex type representing the value.
+CREATE TABLE map_t
+(
+ id BIGINT,
+ m1 MAP <STRING, STRING>,
+ m2 MAP <STRING, BIGINT>,
+ m3 MAP <BIGINT, STRING>,
+ m4 MAP <BIGINT, BIGINT>,
+ m5 MAP <STRING, ARRAY <STRING>>
+)
+STORED AS PARQUET;
+
+-- What goes inside the < > for a STRUCT is a comma-separated list of fields, each field defined as
+-- name:type. The type can be a scalar or a complex type. The field names for each STRUCT do not clash
+-- with the names of table columns or fields in other STRUCTs. A STRUCT is most often used inside
+-- an ARRAY or a MAP rather than as a top-level column.
+CREATE TABLE struct_t
+(
+ id BIGINT,
+ s1 STRUCT <f1: STRING, f2: BIGINT>,
+ s2 ARRAY <STRUCT <f1: INT, f2: TIMESTAMP>>,
+ s3 MAP <BIGINT, STRUCT <name: STRING, birthday: TIMESTAMP>>
+)
+STORED AS PARQUET;
+]]>
+</codeblock>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="complex_types_queries">
+
+ <title>Queries and Complex Types</title>
+
+ <conbody>
+
+<!-- Hive does the JSON output business: http://www.datascience-labs.com/hive/hiveql-data-manipulation/ -->
+
+<!-- SELECT * works but skips any nested type columns. -->
+
+ <p>
+ The result set of an Impala query always contains only scalar types; the elements and fields within any complex type columns must
+ be <q>unpacked</q> using join queries. A query cannot directly retrieve the entire value for a complex type column. Impala
+ returns an error in this case. Queries using <codeph>SELECT *</codeph> are allowed for tables with complex types, but the
+ columns with complex types are skipped.
+ </p>
+
+ <p>
+ The following example shows how referring directly to a complex type column returns an error, while <codeph>SELECT *</codeph> on
+ the same table succeeds, but only retrieves the scalar columns.
+ </p>
+
+ <note conref="../shared/impala_common.xml#common/complex_type_schema_pointer"/>
+
+<!-- Original error message:
+ERROR: AnalysisException: Expr 'c_orders' in select list returns a complex type 'ARRAY<STRUCT<o_orderkey:BIGINT,o_orderstatus:STRING,o_totalprice:DECIMAL(12,2),o_orderdate:STRING,o_orderpriority:STRING,o_clerk:STRING,o_shippriority:INT,o_comment:STRING,o_lineitems:ARRAY<STRUCT<l_partkey:BIGINT,l_suppkey:BIGINT,l_linenumber:INT,l_quantity:DECIMAL(12,2),l_extendedprice:DECIMAL(12,2),l_discount:DECIMAL(12,2),l_tax:DECIMAL(12,2),l_returnflag:STRING,l_linestatus:STRING,l_shipdate:STRING,l_commitdate:STRING,l_receiptdate:STRING,l_shipinstruct:STRING,l_shipmode:STRING,l_comment:STRING>>>>'.
+-->
+
+<codeblock><![CDATA[SELECT c_orders FROM customer LIMIT 1;
+ERROR: AnalysisException: Expr 'c_orders' in select list returns a complex type 'ARRAY<STRUCT<o_orderkey:BIGINT,o_orderstatus:STRING, ... l_receiptdate:STRING,l_shipinstruct:STRING,l_shipmode:STRING,l_comment:STRING>>>>'.
+Only scalar types are allowed in the select list.
+
+-- Original table has several scalar columns and one complex column.
+DESCRIBE customer;
++--------------+------------------------------------+
+| name | type |
++--------------+------------------------------------+
+| c_custkey | bigint |
+| c_name | string |
+...
+| c_orders | array<struct< |
+| | o_orderkey:bigint, |
+| | o_orderstatus:string, |
+| | o_totalprice:decimal(12,2), |
+...
+| | >> |
++--------------+------------------------------------+
+
+-- When we SELECT * from that table, only the scalar columns come back in the result set.
+CREATE TABLE select_star_customer STORED AS PARQUET AS SELECT * FROM customer;
++------------------------+
+| summary |
++------------------------+
+| Inserted 150000 row(s) |
++------------------------+
+
+-- The c_orders column, being of complex type, was not included in the SELECT * result set.
+DESC select_star_customer;
++--------------+---------------+
+| name | type |
++--------------+---------------+
+| c_custkey | bigint |
+| c_name | string |
+| c_address | string |
+| c_nationkey | smallint |
+| c_phone | string |
+| c_acctbal | decimal(12,2) |
+| c_mktsegment | string |
+| c_comment | string |
++--------------+---------------+
+]]>
+</codeblock>
+
+<!-- To do: These "references to..." bits could be promoted to their own 'expressions' subheads. -->
+
+ <p>
+ References to fields within <codeph>STRUCT</codeph> columns use dot notation. If the field name is unambiguous, you can omit
+ qualifiers such as table name, column name, or even the <codeph>ITEM</codeph> or <codeph>VALUE</codeph> pseudocolumn names for
+ <codeph>STRUCT</codeph> elements inside an <codeph>ARRAY</codeph> or a <codeph>MAP</codeph>.
+ </p>
+
+<!-- To do: rewrite example to use CUSTOMER table. -->
+
+<!-- Don't think TPC-H schema has a bare STRUCT to use in such a simple query though. -->
+
+<!-- Perhaps reuse the STRUCT_DEMO example here. -->
+
+<codeblock>SELECT id, address.city FROM customers WHERE address.zip = 94305;
+</codeblock>
+
+ <p>
+ References to elements within <codeph>ARRAY</codeph> columns use the <codeph>ITEM</codeph> pseudocolumn:
+ </p>
+
+<!-- To do: shorten qualified names. -->
+
+<codeblock>select r_name, r_nations.item.n_name from region, region.r_nations limit 7;
++--------+----------------+
+| r_name | item.n_name |
++--------+----------------+
+| EUROPE | UNITED KINGDOM |
+| EUROPE | RUSSIA |
+| EUROPE | ROMANIA |
+| EUROPE | GERMANY |
+| EUROPE | FRANCE |
+| ASIA | VIETNAM |
+| ASIA | CHINA |
++--------+----------------+
+</codeblock>
+
+ <p>
+ References to fields within <codeph>MAP</codeph> columns use the <codeph>KEY</codeph> and <codeph>VALUE</codeph> pseudocolumns.
+ In this example, once the query establishes the alias <codeph>MAP_FIELD</codeph> for a <codeph>MAP</codeph> column with a
+ <codeph>STRING</codeph> key and an <codeph>INT</codeph> value, the query can refer to <codeph>MAP_FIELD.KEY</codeph> and
+ <codeph>MAP_FIELD.VALUE</codeph>, which have zero, one, or many instances for each row from the containing table.
+ </p>
+
+<codeblock><![CDATA[DESCRIBE table_0;
++---------+-----------------------+
+| name | type |
++---------+-----------------------+
+| field_0 | string |
+| field_1 | map<string,int> |
+...
+
+SELECT field_0, map_field.key, map_field.value
+ FROM table_0, table_0.field_1 AS map_field
+WHERE length(field_0) = 1
+LIMIT 10;
++---------+-----------+-------+
+| field_0 | key | value |
++---------+-----------+-------+
+| b | gshsgkvd | NULL |
+| b | twrtcxj6 | 18 |
+| b | 2vp5 | 39 |
+| b | fh0s | 13 |
+| v | 2 | 41 |
+| v | 8b58mz | 20 |
+| v | hw | 16 |
+| v | 65l388pyt | 29 |
+| v | 03k68g91z | 30 |
+| v | r2hlg5b | NULL |
++---------+-----------+-------+
+]]>
+</codeblock>
+
+<!-- To do: refer to or reuse examples from the other subtopics that discuss pseudocolumns etc. -->
+
+ <p>
+ When complex types are nested inside each other, you use a combination of joins, pseudocolumn names, and dot notation to refer
+ to specific fields at the appropriate level. This is the most frequent form of query syntax for complex columns, because the
+ typical use case involves two levels of complex types, such as an <codeph>ARRAY</codeph> of <codeph>STRUCT</codeph> elements.
+ </p>
+
+<!-- To do: rewrite example to use CUSTOMER table. -->
+
+<!-- This is my own manufactured example so I have the table, and the query works, but I don't have sample data to show. -->
+
+<codeblock>SELECT id, phone_numbers.area_code FROM contact_info_many_structs INNER JOIN contact_info_many_structs.phone_numbers phone_numbers LIMIT 3;
+</codeblock>
+
+ <p>
+ You can express relationships between <codeph>ARRAY</codeph> and <codeph>MAP</codeph> columns at different levels as joins. You
+ include comparison operators between fields at the top level and within the nested type columns so that Impala can do the
+ appropriate join operation.
+ </p>
+
+<!-- Don't think TPC-H schema has any instances where outer field matches up with inner one though. -->
+
+<!-- But don't think this usage is important enough to call out at this early point. Hide the example for now. -->
+
+<!--
+<codeblock>SELECT o.txn_id FROM customers c, c.orders o WHERE o.cc = c.preferred_cc;
+SELECT c.id, o.txn_id FROM customers c, c.orders o;
+</codeblock>
+-->
+
+<!-- To do: move these examples down, to the examples subtopic at the end. -->
+
+ <note conref="../shared/impala_common.xml#common/complex_type_schema_pointer"/>
+
+ <p>
+ For example, the following queries work equivalently. They each return customer and order data for customers that have at least
+ one order.
+ </p>
+
+<codeblock>SELECT c.c_name, o.o_orderkey FROM customer c, c.c_orders o LIMIT 5;
++--------------------+------------+
+| c_name | o_orderkey |
++--------------------+------------+
+| Customer#000072578 | 558821 |
+| Customer#000072578 | 2079810 |
+| Customer#000072578 | 5768068 |
+| Customer#000072578 | 1805604 |
+| Customer#000072578 | 3436389 |
++--------------------+------------+
+
+SELECT c.c_name, o.o_orderkey FROM customer c INNER JOIN c.c_orders o LIMIT 5;
++--------------------+------------+
+| c_name | o_orderkey |
++--------------------+------------+
+| Customer#000072578 | 558821 |
+| Customer#000072578 | 2079810 |
+| Customer#000072578 | 5768068 |
+| Customer#000072578 | 1805604 |
+| Customer#000072578 | 3436389 |
++--------------------+------------+
+</codeblock>
+
+ <p>
+ The following query using an outer join returns customers that have orders, plus customers with no orders (no entries in the
+ <codeph>C_ORDERS</codeph> array):
+ </p>
+
+<codeblock><![CDATA[SELECT c.c_custkey, o.o_orderkey
+ FROM customer c LEFT OUTER JOIN c.c_orders o
+LIMIT 5;
++-----------+------------+
+| c_custkey | o_orderkey |
++-----------+------------+
+| 60210 | NULL |
+| 147873 | NULL |
+| 72578 | 558821 |
+| 72578 | 2079810 |
+| 72578 | 5768068 |
++-----------+------------+
+]]>
+</codeblock>
+
+ <p>
+ The following query returns <i>only</i> customers that have no orders. (With <codeph>LEFT ANTI JOIN</codeph> or <codeph>LEFT
+ SEMI JOIN</codeph>, the query can only refer to columns from the left-hand table, because by definition there is no matching
+ information in the right-hand table.)
+ </p>
+
+<codeblock><![CDATA[SELECT c.c_custkey, c.c_name
+ FROM customer c LEFT ANTI JOIN c.c_orders o
+LIMIT 5;
++-----------+--------------------+
+| c_custkey | c_name |
++-----------+--------------------+
+| 60210 | Customer#000060210 |
+| 147873 | Customer#000147873 |
+| 141576 | Customer#000141576 |
+| 85365 | Customer#000085365 |
+| 70998 | Customer#000070998 |
++-----------+--------------------+
+]]>
+</codeblock>
+
+<!-- To do: promote the correlated subquery aspect into its own subtopic. -->
+
+ <p>
+ You can also perform correlated subqueries to examine the properties of complex type columns for each row in the result set.
+ </p>
+
+ <p>
+ Count the number of orders per customer. Note the correlated reference to the table alias <codeph>C</codeph>. The
+ <codeph>COUNT(*)</codeph> operation applies to all the elements of the <codeph>C_ORDERS</codeph> array for the corresponding
+ row, avoiding the need for a <codeph>GROUP BY</codeph> clause.
+ </p>
+
+<codeblock>select c_name, howmany FROM customer c, (SELECT COUNT(*) howmany FROM c.c_orders) v limit 5;
++--------------------+---------+
+| c_name | howmany |
++--------------------+---------+
+| Customer#000030065 | 15 |
+| Customer#000065455 | 18 |
+| Customer#000113644 | 21 |
+| Customer#000111078 | 0 |
+| Customer#000024621 | 0 |
++--------------------+---------+
+</codeblock>
+
+ <p>
+ Count the number of orders per customer, ignoring any customers that have not placed any orders:
+ </p>
+
+<codeblock>SELECT c_name, howmany_orders
+FROM
+ customer c,
+ (SELECT COUNT(*) howmany_orders FROM c.c_orders) subq1
+WHERE howmany_orders > 0
+LIMIT 5;
++--------------------+----------------+
+| c_name | howmany_orders |
++--------------------+----------------+
+| Customer#000072578 | 7 |
+| Customer#000046378 | 26 |
+| Customer#000069815 | 11 |
+| Customer#000079058 | 12 |
+| Customer#000092239 | 26 |
++--------------------+----------------+
+</codeblock>
+
+ <p>
+ Count the number of line items in each order. The reference to <codeph>C.C_ORDERS</codeph> in the <codeph>FROM</codeph> clause
+ is needed because the <codeph>O_ORDERKEY</codeph> field is a member of the elements in the <codeph>C_ORDERS</codeph> array. The
+ subquery labelled <codeph>SUBQ1</codeph> is correlated: it is re-evaluated for the <codeph>C_ORDERS.O_LINEITEMS</codeph> array
+ from each row of the <codeph>CUSTOMER</codeph> table.
+ </p>
+
+<codeblock>SELECT c_name, o_orderkey, howmany_line_items
+FROM
+ customer c,
+ c.c_orders t2,
+ (SELECT COUNT(*) howmany_line_items FROM c.c_orders.o_lineitems) subq1
+WHERE howmany_line_items > 0
+LIMIT 5;
++--------------------+------------+--------------------+
+| c_name | o_orderkey | howmany_line_items |
++--------------------+------------+--------------------+
+| Customer#000020890 | 1884930 | 95 |
+| Customer#000020890 | 4570754 | 95 |
+| Customer#000020890 | 3771072 | 95 |
+| Customer#000020890 | 2555489 | 95 |
+| Customer#000020890 | 919171 | 95 |
++--------------------+------------+--------------------+
+</codeblock>
+
+ <p>
+ Get the number of orders, the average order price, and the maximum items in any order per customer. For this example, the
+ subqueries labelled <codeph>SUBQ1</codeph> and <codeph>SUBQ2</codeph> are correlated: they are re-evaluated for each row from
+ the original <codeph>CUSTOMER</codeph> table, and only apply to the complex columns associated with that row.
+ </p>
+
+<codeblock>SELECT c_name, howmany, average_price, most_items
+FROM
+ customer c,
+ (SELECT COUNT(*) howmany, AVG(o_totalprice) average_price FROM c.c_orders) subq1,
+ (SELECT MAX(l_quantity) most_items FROM c.c_orders.o_lineitems ) subq2
+LIMIT 5;
++--------------------+---------+---------------+------------+
+| c_name | howmany | average_price | most_items |
++--------------------+---------+---------------+------------+
+| Customer#000030065 | 15 | 128908.34 | 50.00 |
+| Customer#000088191 | 0 | NULL | NULL |
+| Customer#000101555 | 10 | 164250.31 | 50.00 |
+| Customer#000022092 | 0 | NULL | NULL |
+| Customer#000036277 | 27 | 166040.06 | 50.00 |
++--------------------+---------+---------------+------------+
+</codeblock>
+
+ <p>
+ For example, these queries show how to access information about the <codeph>ARRAY</codeph> elements within the
+ <codeph>CUSTOMER</codeph> table from the <q>nested TPC-H</q> schema, starting with the initial <codeph>ARRAY</codeph> elements
+ and progressing to examine the <codeph>STRUCT</codeph> fields of the <codeph>ARRAY</codeph>, and then the elements nested within
+ another <codeph>ARRAY</codeph> of <codeph>STRUCT</codeph>:
+ </p>
+
+<codeblock><![CDATA[-- How many orders does each customer have?
+-- The type of the ARRAY column doesn't matter, this is just counting the elements.
+SELECT c_custkey, count(*)
+ FROM customer, customer.c_orders
+GROUP BY c_custkey
+LIMIT 5;
++-----------+----------+
+| c_custkey | count(*) |
++-----------+----------+
+| 61081 | 21 |
+| 115987 | 15 |
+| 69685 | 19 |
+| 109124 | 15 |
+| 50491 | 12 |
++-----------+----------+
+
+-- How many line items are part of each customer order?
+-- Now we examine a field from a STRUCT nested inside the ARRAY.
+SELECT c_custkey, c_orders.o_orderkey, count(*)
+ FROM customer, customer.c_orders c_orders, c_orders.o_lineitems
+GROUP BY c_custkey, c_orders.o_orderkey
+LIMIT 5;
++-----------+------------+----------+
+| c_custkey | o_orderkey | count(*) |
++-----------+------------+----------+
+| 63367 | 4985959 | 7 |
+| 53989 | 1972230 | 2 |
+| 143513 | 5750498 | 5 |
+| 17849 | 4857989 | 1 |
+| 89881 | 1046437 | 1 |
++-----------+------------+----------+
+
+-- What are the line items in each customer order?
+-- One of the STRUCT fields inside the ARRAY is another
+-- ARRAY containing STRUCT elements. The join finds
+-- all the related items from both levels of ARRAY.
+SELECT c_custkey, o_orderkey, l_partkey
+ FROM customer, customer.c_orders, c_orders.o_lineitems
+LIMIT 5;
++-----------+------------+-----------+
+| c_custkey | o_orderkey | l_partkey |
++-----------+------------+-----------+
+| 113644 | 2738497 | 175846 |
+| 113644 | 2738497 | 27309 |
+| 113644 | 2738497 | 175873 |
+| 113644 | 2738497 | 88559 |
+| 113644 | 2738497 | 8032 |
++-----------+------------+-----------+
+]]>
+</codeblock>
+
+ </conbody>
+
+ </concept>
+
+ </concept>
+
+ <concept id="pseudocolumns">
+
+ <title>Pseudocolumns for ARRAY and MAP Types</title>
+
+ <conbody>
+
+ <p>
+ Each element in an <codeph>ARRAY</codeph> type has a position, indexed starting from zero, and a value. Each element in a
+ <codeph>MAP</codeph> type represents a key-value pair. Impala provides pseudocolumns that let you retrieve this metadata as part
+ of a query, or filter query results by including such things in a <codeph>WHERE</codeph> clause. You refer to the pseudocolumns as
+ part of qualified column names in queries:
+ </p>
+
+ <ul>
+ <li>
+ <codeph>ITEM</codeph>: The value of an array element. If the <codeph>ARRAY</codeph> contains <codeph>STRUCT</codeph> elements,
+ you can refer to either <codeph><varname>array_name</varname>.ITEM.<varname>field_name</varname></codeph> or use the shorthand
+ <codeph><varname>array_name</varname>.<varname>field_name</varname></codeph>.
+ </li>
+
+ <li>
+ <codeph>POS</codeph>: The position of an element within an array.
+ </li>
+
+ <li>
+ <codeph>KEY</codeph>: The value forming the first part of a key-value pair in a map. It is not necessarily unique.
+ </li>
+
+ <li>
+ <codeph>VALUE</codeph>: The data item forming the second part of a key-value pair in a map. If the <codeph>VALUE</codeph> part
+ of the <codeph>MAP</codeph> element is a <codeph>STRUCT</codeph>, you can refer to either
+ <codeph><varname>map_name</varname>.VALUE.<varname>field_name</varname></codeph> or use the shorthand
+ <codeph><varname>map_name</varname>.<varname>field_name</varname></codeph>.
+ </li>
+ </ul>
+
+<!-- To do: Consider whether to move the detailed subtopics underneath ARRAY and MAP instead of embedded here. -->
+
+ <p outputclass="toc inpage"/>
+
+ </conbody>
+
+ <concept id="item">
+
+ <title id="pos">ITEM and POS Pseudocolumns</title>
+
+ <conbody>
+
+ <p>
+ When an <codeph>ARRAY</codeph> column contains <codeph>STRUCT</codeph> elements, you can refer to a field within the
+ <codeph>STRUCT</codeph> using a qualified name of the form
+ <codeph><varname>array_column</varname>.<varname>field_name</varname></codeph>. If the <codeph>ARRAY</codeph> contains scalar
+ values, Impala recognizes the special name <codeph><varname>array_column</varname>.ITEM</codeph> to represent the value of each
+ scalar array element. For example, if a column contained an <codeph>ARRAY</codeph> where each element was a
+ <codeph>STRING</codeph>, you would use <codeph><varname>array_name</varname>.ITEM</codeph> to refer to each scalar value in the
+ <codeph>SELECT</codeph> list, or the <codeph>WHERE</codeph> or other clauses.
+ </p>
+
+ <p>
+ This example shows a table with two <codeph>ARRAY</codeph> columns whose elements are of the scalar type
+ <codeph>STRING</codeph>. When referring to the values of the array elements in the <codeph>SELECT</codeph> list,
+ <codeph>WHERE</codeph> clause, or <codeph>ORDER BY</codeph> clause, you use the <codeph>ITEM</codeph> pseudocolumn because
+ within the array, the individual elements have no defined names.
+ </p>
+
+<codeblock><![CDATA[create TABLE persons_of_interest
+(
+person_id BIGINT,
+aliases ARRAY <STRING>,
+associates ARRAY <STRING>,
+real_name STRING
+)
+STORED AS PARQUET;
+
+-- Get all the aliases of each person.
+SELECT real_name, aliases.ITEM
+ FROM persons_of_interest, persons_of_interest.aliases
+ORDER BY real_name, aliases.item;
+
+-- Search for particular associates of each person.
+SELECT real_name, associates.ITEM
+ FROM persons_of_interest, persons_of_interest.associates
+WHERE associates.item LIKE '% MacGuffin';
+]]>
+</codeblock>
+
+ <p>
+ Because an array is inherently an ordered data structure, Impala recognizes the special name
+ <codeph><varname>array_column</varname>.POS</codeph> to represent the numeric position of each element within the array. The
+ <codeph>POS</codeph> pseudocolumn lets you filter or reorder the result set based on the sequence of array elements.
+ </p>
+
+ <p>
+ The following example uses a table from a flattened version of the TPC-H schema. The <codeph>REGION</codeph> table only has a
+ few rows, such as one row for Europe and one for Asia. The row for each region represents all the countries in that region as an
+ <codeph>ARRAY</codeph> of <codeph>STRUCT</codeph> elements:
+ </p>
+
+<codeblock><![CDATA[[localhost:21000] > desc region;
++-------------+--------------------------------------------------------------------+
+| name | type |
++-------------+--------------------------------------------------------------------+
+| r_regionkey | smallint |
+| r_name | string |
+| r_comment | string |
+| r_nations | array<struct<n_nationkey:smallint,n_name:string,n_comment:string>> |
++-------------+--------------------------------------------------------------------+
+]]>
+</codeblock>
+
+ <p>
+ To find the countries within a specific region, you use a join query. To find out the order of elements in the array, you also
+ refer to the <codeph>POS</codeph> pseudocolumn in the select list:
+ </p>
+
+<codeblock>[localhost:21000] > SELECT r1.r_name, r2.n_name, <b>r2.POS</b>
+ > FROM region r1 INNER JOIN r1.r_nations r2
+ > WHERE r1.r_name = 'ASIA';
++--------+-----------+-----+
+| r_name | n_name | pos |
++--------+-----------+-----+
+| ASIA | VIETNAM | 0 |
+| ASIA | CHINA | 1 |
+| ASIA | JAPAN | 2 |
+| ASIA | INDONESIA | 3 |
+| ASIA | INDIA | 4 |
++--------+-----------+-----+
+</codeblock>
+
+ <p>
+ Once you know the positions of the elements, you can use that information in subsequent queries, for example to change the
+ ordering of results from the complex type column or to filter certain elements from the array:
+ </p>
+
+<codeblock>[localhost:21000] > SELECT r1.r_name, r2.n_name, r2.POS
+ > FROM region r1 INNER JOIN r1.r_nations r2
+ > WHERE r1.r_name = 'ASIA'
+ > <b>ORDER BY r2.POS DESC</b>;
++--------+-----------+-----+
+| r_name | n_name | pos |
++--------+-----------+-----+
+| ASIA | INDIA | 4 |
+| ASIA | INDONESIA | 3 |
+| ASIA | JAPAN | 2 |
+| ASIA | CHINA | 1 |
+| ASIA | VIETNAM | 0 |
++--------+-----------+-----+
+[localhost:21000] > SELECT r1.r_name, r2.n_name, r2.POS
+ > FROM region r1 INNER JOIN r1.r_nations r2
+ > WHERE r1.r_name = 'ASIA' AND <b>r2.POS BETWEEN 1 and 3</b>;
++--------+-----------+-----+
+| r_name | n_name | pos |
++--------+-----------+-----+
+| ASIA | CHINA | 1 |
+| ASIA | JAPAN | 2 |
+| ASIA | INDONESIA | 3 |
++--------+-----------+-----+
+</codeblock>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="key">
+
+ <title id="value">KEY and VALUE Pseudocolumns</title>
+
+ <conbody>
+
+ <p>
+ The <codeph>MAP</codeph> data type is suitable for representing sparse or wide data structures, where each row might only have
+ entries for a small subset of named fields. Because the element names (the map keys) vary depending on the row, a query must be
+ able to refer to both the key and the value parts of each key-value pair. The <codeph>KEY</codeph> and <codeph>VALUE</codeph>
+ pseudocolumns let you refer to the parts of the key-value pair independently within the query, as
+ <codeph><varname>map_column</varname>.KEY</codeph> and <codeph><varname>map_column</varname>.VALUE</codeph>.
+ </p>
+
+ <p>
+ The <codeph>KEY</codeph> must always be a scalar type, such as <codeph>STRING</codeph>, <codeph>BIGINT</codeph>, or
+ <codeph>TIMESTAMP</codeph>. It can be <codeph>NULL</codeph>. Values of the <codeph>KEY</codeph> field are not necessarily unique
+ within the same <codeph>MAP</codeph>. You apply any required <codeph>DISTINCT</codeph>, <codeph>GROUP BY</codeph>, and other
+ clauses in the query, and loop through the result set to process all the values matching any specified keys.
+ </p>
+
+ <p>
+ The <codeph>VALUE</codeph> can be either a scalar type or another complex type. If the <codeph>VALUE</codeph> is a
+ <codeph>STRUCT</codeph>, you can construct a qualified name
+ <codeph><varname>map_column</varname>.VALUE.<varname>struct_field</varname></codeph> to refer to the individual fields inside
+ the value part. If the <codeph>VALUE</codeph> is an <codeph>ARRAY</codeph> or another <codeph>MAP</codeph>, you must include
+ another join condition that establishes a table alias for <codeph><varname>map_column</varname>.VALUE</codeph>, and then
+ construct another qualified name using that alias, for example <codeph><varname>table_alias</varname>.ITEM</codeph> or
+      <codeph><varname>table_alias</varname>.KEY</codeph> and <codeph><varname>table_alias</varname>.VALUE</codeph>.
+ </p>
+
+ <p>
+ The following example shows different ways to access a <codeph>MAP</codeph> column using the <codeph>KEY</codeph> and
+ <codeph>VALUE</codeph> pseudocolumns. The <codeph>DETAILS</codeph> column has a <codeph>STRING</codeph> first part with short,
+ standardized values such as <codeph>'Recurring'</codeph>, <codeph>'Lucid'</codeph>, or <codeph>'Anxiety'</codeph>. This is the
+ <q>key</q> that is used to look up particular kinds of elements from the <codeph>MAP</codeph>. The second part, also a
+ <codeph>STRING</codeph>, is a longer free-form explanation. Impala gives you the standard pseudocolumn names
+ <codeph>KEY</codeph> and <codeph>VALUE</codeph> for the two parts, and you apply your own conventions and interpretations to the
+ underlying values.
+ </p>
+
+ <note>
+ If you find that the single-item nature of the <codeph>VALUE</codeph> makes it difficult to model your data accurately, the
+ solution is typically to add some nesting to the complex type. For example, to have several sets of key-value pairs, make the
+ column an <codeph>ARRAY</codeph> whose elements are <codeph>MAP</codeph>. To make a set of key-value pairs that holds more
+ elaborate information, make a <codeph>MAP</codeph> column whose <codeph>VALUE</codeph> part contains an <codeph>ARRAY</codeph>
+ or a <codeph>STRUCT</codeph>.
+ </note>
+
+<codeblock><![CDATA[CREATE TABLE dream_journal
+(
+ dream_id BIGINT,
+ details MAP <STRING,STRING>
+)
+STORED AS PARQUET;
+]]>
+
+-- What are all the types of dreams that are recorded?
+SELECT DISTINCT details.KEY FROM dream_journal, dream_journal.details;
+
+-- How many lucid dreams were recorded?
+-- Because there is no GROUP BY, we count the 'Lucid' keys across all rows.
+SELECT <b>COUNT(details.KEY)</b>
+ FROM dream_journal, dream_journal.details
+WHERE <b>details.KEY = 'Lucid'</b>;
+
+-- Print a report of a subset of dreams, filtering based on both the lookup key
+-- and the detailed value.
+SELECT dream_id, <b>details.KEY AS "Dream Type"</b>, <b>details.VALUE AS "Dream Summary"</b>
+ FROM dream_journal, dream_journal.details
+WHERE
+ <b>details.KEY IN ('Happy', 'Pleasant', 'Joyous')</b>
+ AND <b>details.VALUE LIKE '%childhood%'</b>;
+</codeblock>
+
+ <p>
+ The following example shows a more elaborate version of the previous table, where the <codeph>VALUE</codeph> part of the
+ <codeph>MAP</codeph> entry is a <codeph>STRUCT</codeph> rather than a scalar type. Now instead of referring to the
+ <codeph>VALUE</codeph> pseudocolumn directly, you use dot notation to refer to the <codeph>STRUCT</codeph> fields inside it.
+ </p>
+
+<codeblock><![CDATA[CREATE TABLE better_dream_journal
+(
+ dream_id BIGINT,
+ details MAP <STRING,STRUCT <summary: STRING, when_happened: TIMESTAMP, duration: DECIMAL(5,2), woke_up: BOOLEAN> >
+)
+STORED AS PARQUET;
+]]>
+
+-- Do more elaborate reporting and filtering by examining multiple attributes within the same dream.
+SELECT dream_id, <b>details.KEY AS "Dream Type"</b>, <b>details.VALUE.summary AS "Dream Summary"</b>, <b>details.VALUE.duration AS "Duration"</b>
+ FROM better_dream_journal, better_dream_journal.details
+WHERE
+ <b>details.KEY IN ('Anxiety', 'Nightmare')</b>
+ AND <b>details.VALUE.duration > 60</b>
+ AND <b>details.VALUE.woke_up = TRUE</b>;
+
+-- Remember that if the ITEM or VALUE contains a STRUCT, you can reference
+-- the STRUCT fields directly without the .ITEM or .VALUE qualifier.
+SELECT dream_id, <b>details.KEY AS "Dream Type"</b>, <b>details.summary AS "Dream Summary"</b>, <b>details.duration AS "Duration"</b>
+ FROM better_dream_journal, better_dream_journal.details
+WHERE
+ <b>details.KEY IN ('Anxiety', 'Nightmare')</b>
+ AND <b>details.duration > 60</b>
+ AND <b>details.woke_up = TRUE</b>;
+</codeblock>
+
+ </conbody>
+
+ </concept>
+
+ </concept>
+
+ <concept id="complex_types_etl">
+
+<!-- This topic overlaps in many ways with the preceding one. See which theme resonates with users, and combine them under the better title. -->
+
+ <title>Loading Data Containing Complex Types</title>
+
+ <conbody>
+
+ <p>
+ Because the Impala <codeph>INSERT</codeph> statement does not currently support creating new data with complex type columns, or
+ copying existing complex type values from one table to another, you primarily use Impala to query Parquet tables with complex
+ types where the data was inserted through Hive, or create tables with complex types where you already have existing Parquet data
+ files.
+ </p>
+
+ <p>
+ If you have created a Hive table with the Parquet file format and containing complex types, use the same table for Impala queries
+ with no changes. If you have such a Hive table in some other format, use a Hive <codeph>CREATE TABLE AS SELECT ... STORED AS
+ PARQUET</codeph> or <codeph>INSERT ... SELECT</codeph> statement to produce an equivalent Parquet table that Impala can query.
+ </p>
+
+ <p>
+ If you have existing Parquet data files containing complex types, located outside of any Impala or Hive table, such as data files
+ created by Spark jobs, you can use an Impala <codeph>CREATE TABLE ... STORED AS PARQUET</codeph> statement, followed by an Impala
+ <codeph>LOAD DATA</codeph> statement to move the data files into the table. As an alternative, you can use an Impala
+ <codeph>CREATE EXTERNAL TABLE</codeph> statement to create a table pointing to the HDFS directory that already contains the data
+ files.
+ </p>
+
+ <p>
+ Perhaps the simplest way to get started with complex type data is to take a denormalized table containing duplicated values, and
+ use an <codeph>INSERT ... SELECT</codeph> statement to copy the data into a Parquet table and condense the repeated values into
+ complex types. With the Hive <codeph>INSERT</codeph> statement, you use the <codeph>COLLECT_LIST()</codeph>,
+ <codeph>NAMED_STRUCT()</codeph>, and <codeph>MAP()</codeph> constructor functions within a <codeph>GROUP BY</codeph> query to
+ produce the complex type values. <codeph>COLLECT_LIST()</codeph> turns a sequence of values into an <codeph>ARRAY</codeph>.
+ <codeph>NAMED_STRUCT()</codeph> uses the first, third, and so on arguments as the field names for a <codeph>STRUCT</codeph>, to
+ match the field names from the <codeph>CREATE TABLE</codeph> statement.
+ </p>
+
+ <note>
+ Because Hive currently cannot construct individual rows using complex types through the <codeph>INSERT ... VALUES</codeph> syntax,
+ you prepare the data in flat form in a separate table, then copy it to the table with complex columns using <codeph>INSERT ...
+ SELECT</codeph> and the complex type constructors. See <xref href="impala_complex_types.xml#complex_types_ex_hive_etl"/> for
+ examples.
+ </note>
+
+ </conbody>
+
+ </concept>
+
+ <concept id="complex_types_nesting">
+
+ <title>Using Complex Types as Nested Types</title>
+
+ <conbody>
+
+ <p>
+ The <codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, and <codeph>MAP</codeph> types can be the top-level types for <q>nested
+ type</q> columns. That is, each of these types can contain other complex or scalar types, with multiple levels of nesting to a
+ maximum depth of 100. For example, you can have an array of structures, a map containing other maps, a structure containing an
+ array of other structures, and so on. At the lowest level, there are always scalar types making up the fields of a
+ <codeph>STRUCT</codeph>, elements of an <codeph>ARRAY</codeph>, and keys and values of a <codeph>MAP</codeph>.
+ </p>
+
+ <p>
+ Schemas involving complex types typically use some level of nesting for the complex type columns.
+ </p>
+
+ <p>
+ For example, to model a relationship like a dimension table and a fact table, you typically use an <codeph>ARRAY</codeph> where
+ each array element is a <codeph>STRUCT</codeph>. The <codeph>STRUCT</codeph> fields represent what would traditionally be columns
+ in a separate joined table. It makes little sense to use a <codeph>STRUCT</codeph> as the top-level type for a column, because you
+ could just make the fields of the <codeph>STRUCT</codeph> into regular table columns.
+ </p>
+
+<!-- To do: this example might move somewhere else, under STRUCT itself or in a tips-and-tricks section. -->
+
+ <p>
+      Perhaps the only use case for a top-level <codeph>STRUCT</codeph> would be to allow <codeph>STRUCT</codeph> fields with the
+ same name as columns to coexist in the same table. The following example shows how a table could have a column named
+ <codeph>ID</codeph>, and two separate <codeph>STRUCT</codeph> fields also named <codeph>ID</codeph>. Because the
+ <codeph>STRUCT</codeph> fields are always referenced using qualified names, the identical <codeph>ID</codeph> names do not cause a
+ conflict.
+ </p>
+
+<codeblock><![CDATA[CREATE TABLE struct_namespaces
+(
+ id BIGINT
+ , s1 STRUCT < id: BIGINT, field1: STRING >
+ , s2 STRUCT < id: BIGINT, when_happened: TIMESTAMP >
+)
+STORED AS PARQUET;
+
+select id, s1.id, s2.id from struct_namespaces;
+]]>
+</codeblock>
+
+ <p>
+ It is common to make the value portion of each key-value pair in a <codeph>MAP</codeph> a <codeph>STRUCT</codeph>,
+ <codeph>ARRAY</codeph> of <codeph>STRUCT</codeph>, or other complex type variation. That way, each key in the <codeph>MAP</codeph>
+ can be associated with a flexible and extensible data structure. The key values are not predefined ahead of time (other than by
+      specifying their data type). Therefore, the <codeph>MAP</codeph> can accommodate a rapidly evolving schema, or sparse data
+ structures where each row contains only a few data values drawn from a large set of possible choices.
+ </p>
+
+ <p>
+ Although you can use an <codeph>ARRAY</codeph> of scalar values as the top-level column in a table, such a simple array is
+ typically of limited use for analytic queries. The only property of the array elements, aside from the element value, is the
+      ordering sequence available through the <codeph>POS</codeph> pseudocolumn. To record any additional information about each array element,
+ such as a <codeph>TIMESTAMP</codeph> or a symbolic name, you use an <codeph>ARRAY</codeph> of <codeph>STRUCT</codeph> rather than
+ of scalar values.
+ </p>
+
+ <p>
+ If you are considering having multiple <codeph>ARRAY</codeph> or <codeph>MAP</codeph> columns, with related items under the same
+ position in each <codeph>ARRAY</codeph> or the same key in each <codeph>MAP</codeph>, prefer to use a <codeph>STRUCT</codeph> to
+ group all the related items into a single <codeph>ARRAY</codeph> or <codeph>MAP</codeph>. Doing so avoids the additional storage
+ overhead and potential duplication of key values from having an extra complex type column. Also, because each
+ <codeph>ARRAY</codeph> or <codeph>MAP</codeph> that you reference in the query <codeph>SELECT</codeph> list requires an additional
+ join clause, minimizing the number of complex type columns also makes the query easier to read and maintain, relying more on dot
+ notation to refer to the relevant fields rather than a sequence of join clauses.
+ </p>
+
+ <p>
+ For example, here is a table with several complex type columns all at the top level and containing only scalar types. To retrieve
+ every data item for the row requires a separate join for each <codeph>ARRAY</codeph> or <codeph>MAP</codeph> column. The fields of
+ the <codeph>STRUCT</codeph> can be referenced using dot notation, but there is no real advantage to using the
+ <codeph>STRUCT</codeph> at the top level rather than just making separate columns <codeph>FIELD1</codeph> and
+ <codeph>FIELD2</codeph>.
+ </p>
+
+<codeblock><![CDATA[CREATE TABLE complex_types_top_level
+(
+ id BIGINT,
+ a1 ARRAY<INT>,
+ a2 ARRAY<STRING>,
+ s STRUCT<field1: INT, field2: STRING>,
+-- Numeric lookup key for a string value.
+ m1 MAP<INT,STRING>,
+-- String lookup key for a numeric value.
+ m2 MAP<STRING,INT>
+)
+STORED AS PARQUET;
+
+describe complex_types_top_level;
++------+-----------------+
+| name | type |
++------+-----------------+
+| id | bigint |
+| a1 | array<int> |
+| a2 | array<string> |
+| s | struct< |
+| | field1:int, |
+| | field2:string |
+| | > |
+| m1 | map<int,string> |
+| m2 | map<string,int> |
++------+-----------------+
+
+select
+ id,
+ a1.item,
+ a2.item,
+ s.field1,
+ s.field2,
+ m1.key,
+ m1.value,
+ m2.key,
+ m2.value
+from
+ complex_types_top_level,
+ complex_types_top_level.a1,
+ complex_types_top_level.a2,
+ complex_types_top_level.m1,
+ complex_types_top_level.m2;
+]]>
+</codeblock>
+
+ <p>
+      For example, here is a table with columns containing an <codeph>ARRAY</codeph> of <codeph>STRUCT</codeph>, a <codeph>MAP</codeph>
+      where the value part of each entry is a <codeph>STRUCT</codeph>, and a <codeph>MAP</codeph> where the value part of each entry is an <codeph>ARRAY</codeph> of
+      <codeph>STRUCT</codeph>.
+ </p>
+
+<codeblock><![CDATA[CREATE TABLE nesting_demo
+(
+ user_id BIGINT,
+ family_members ARRAY < STRUCT < name: STRING, email: STRING, date_joined: TIMESTAMP >>,
+ foo map < STRING, STRUCT < f1: INT, f2: INT, f3: TIMESTAMP, f4: BOOLEAN >>,
+ gameplay MAP < STRING , ARRAY < STRUCT <
+ name: STRING, highest: BIGINT, lives_used: INT, total_spent: DECIMAL(16,2)
+ >>>
+)
+STORED AS PARQUET;
+]]>
+</codeblock>
+
+ <p>
+ The <codeph>DESCRIBE</codeph> statement rearranges the <codeph><</codeph> and <codeph>></codeph> separators and the field
+ names within each <codeph>STRUCT</codeph> for easy readability:
+ </p>
+
+<codeblock><![CDATA[DESCRIBE nesting_demo;
++----------------+-----------------------------+
+| name | type |
++----------------+-----------------------------+
+| user_id | bigint |
+| family_members | array<struct< |
+| | name:string, |
+| | email:string, |
+| | date_joined:timestamp |
+| | >>
<TRUNCATED>
[05/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_show.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_show.xml b/docs/topics/impala_show.xml
new file mode 100644
index 0000000..1e8c17d
--- /dev/null
+++ b/docs/topics/impala_show.xml
@@ -0,0 +1,1263 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="show">
+
+ <title>SHOW Statement</title>
+ <titlealts><navtitle>SHOW</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Reports"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">SHOW statement</indexterm>
+ The <codeph>SHOW</codeph> statement is a flexible way to get information about different types of Impala
+ objects.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>SHOW DATABASES [[LIKE] '<varname>pattern</varname>']
+SHOW SCHEMAS [[LIKE] '<varname>pattern</varname>'] - an alias for SHOW DATABASES
+SHOW TABLES [IN <varname>database_name</varname>] [[LIKE] '<varname>pattern</varname>']
+<ph rev="1.2.0">SHOW [AGGREGATE | ANALYTIC] FUNCTIONS [IN <varname>database_name</varname>] [[LIKE] '<varname>pattern</varname>']</ph>
+<ph rev="1.2.1">SHOW CREATE TABLE [<varname>database_name</varname>].<varname>table_name</varname></ph>
+<ph rev="1.2.1">SHOW TABLE STATS [<varname>database_name</varname>.]<varname>table_name</varname></ph>
+<ph rev="1.2.1">SHOW COLUMN STATS [<varname>database_name</varname>.]<varname>table_name</varname></ph>
+<ph rev="1.4.0">SHOW PARTITIONS [<varname>database_name</varname>.]<varname>table_name</varname></ph>
+<ph rev="2.2.0">SHOW FILES IN [<varname>database_name</varname>.]<varname>table_name</varname> [PARTITION (<varname>key_col</varname>=<varname>value</varname> [, <varname>key_col</varname>=<varname>value</varname>])]</ph>
+
+<ph rev="2.0.0">SHOW ROLES
+SHOW CURRENT ROLES
+SHOW ROLE GRANT GROUP <varname>group_name</varname>
+SHOW GRANT ROLE <varname>role_name</varname></ph>
+</codeblock>
+
+<!-- SHOW ROLE GRANT { USER <varname>user_name</varname> | GROUP <varname>group_name</varname> | ROLE <varname>role_name</varname> } -->
+
+<!-- Extracted from the previous codeblock because even hidden content produces blank lines.
+<ph audience="Cloudera" rev="1.4.0">SHOW DATA SOURCES [LIKE '<varname>source_name</varname>']</ph>
+-->
+
+<!-- Some suggestion there would be this syntax for 1.4, but it's not in the builds:
+<ph rev="1.4.0">SHOW [CACHED] TABLES [IN <varname>database_name</varname>] [[LIKE] '<varname>pattern</varname>']</ph>
+<ph rev="1.4.0">SHOW [CACHED] PARTITIONS [<varname>database_name</varname>.]<varname>table_name</varname></ph>
+-->
+
+ <p>
+ Issue a <codeph>SHOW <varname>object_type</varname></codeph> statement to see the appropriate objects in the
+ current database, or <codeph>SHOW <varname>object_type</varname> IN <varname>database_name</varname></codeph>
+ to see objects in a specific database.
+ </p>
+
+ <p>
+ The optional <varname>pattern</varname> argument is a quoted string literal, using Unix-style
+ <codeph>*</codeph> wildcards and allowing <codeph>|</codeph> for alternation. The preceding
+ <codeph>LIKE</codeph> keyword is also optional. All object names are stored in lowercase, so use all
+ lowercase letters in the pattern string. For example:
+ </p>
+
+<codeblock>show databases 'a*';
+show databases like 'a*';
+show tables in some_db like '*fact*';
+use some_db;
+show tables '*dim*|*fact*';</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p outputclass="toc inpage"/>
+
+ </conbody>
+
+ <concept rev="2.2.0" id="show_files">
+
+ <title>SHOW FILES Statement</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Disk Storage"/>
+ <data name="Category" value="Tables"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ The <codeph>SHOW FILES</codeph> statement displays the files that constitute a specified table,
+ or a partition within a partitioned table. This syntax is available in CDH 5.4 and higher
+ only. The output includes the names of the files, the size of each file, and the applicable partition
+ for a partitioned table. The size includes a suffix of <codeph>B</codeph> for bytes,
+ <codeph>MB</codeph> for megabytes, and <codeph>GB</codeph> for gigabytes.
+ </p>
+
+ <note>
+      This statement applies to tables and partitions stored on HDFS, or in the Amazon Simple Storage Service (S3).
+ It does not apply to views.
+ It does not apply to tables mapped onto HBase, because HBase does not use the same file-based storage layout.
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ You can use this statement to verify the results of your ETL process: that is, that
+ the expected files are present, with the expected sizes. You can examine the file information
+ to detect conditions such as empty files, missing files, or inefficient layouts due to
+ a large number of small files. When you use <codeph>INSERT</codeph> statements to copy
+ from one table to another, you can see how the file layout changes due to file format
+ conversions, compaction of small input files into large data blocks, and
+ multiple output files from parallel queries and partitioned inserts.
+ </p>
+
+ <p>
+ The output from this statement does not include files that Impala considers to be hidden
+ or invisible, such as those whose names start with a dot or an underscore, or that
+ end with the suffixes <codeph>.copying</codeph> or <codeph>.tmp</codeph>.
+ </p>
+
+ <p>
+ The information for partitioned tables complements the output of the <codeph>SHOW PARTITIONS</codeph>
+ statement, which summarizes information about each partition. <codeph>SHOW PARTITIONS</codeph>
+ produces some output for each partition, while <codeph>SHOW FILES</codeph> does not
+ produce any output for empty partitions because they do not include any data files.
+ </p>
+
+<!-- Extensive round of testing makes me pretty confident of these findings. -->
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have read
+ permission for all the table files, read and execute permission for all the directories that make up the table,
+ and execute permission for the database directory and all its parent directories.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example constructs <codeph>SHOW FILES</codeph> statements
+      for an unpartitioned table using text format:
+ </p>
+
+<codeblock>[localhost:21000] > create table unpartitioned_text (x bigint, s string);
+[localhost:21000] > insert into unpartitioned_text (x, s) select id, name from oreilly.sample_data limit 20e6;
+[localhost:21000] > show files in unpartitioned_text;
++-------------------------------------------------------------------------------------+----------+-----------+
+| path | size | partition |
++-------------------------------------------------------------------------------------+----------+-----------+
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_text/35665776ef85cfaf_1012432410_data.0. | 448.31MB | |
++-------------------------------------------------------------------------------------+----------+-----------+
+[localhost:21000] > insert into unpartitioned_text (x, s) select id, name from oreilly.sample_data limit 100e6;
+[localhost:21000] > show files in unpartitioned_text;
++---------------------------------------------------------------------------------------------+----------+-----------+
+| path | size | partition |
++---------------------------------------------------------------------------------------------+----------+-----------+
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_text/35665776ef85cfaf_1012432410_data.0. | 448.31MB | |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_text/ac3dba252a8952b8_1663177415_data.0. | 2.19GB | |
++---------------------------------------------------------------------------------------------+----------+-----------+
+</codeblock>
+
+ <p>
+ This example illustrates how, after issuing some <codeph>INSERT ... VALUES</codeph> statements,
+ the table now contains some tiny files of just a few bytes. Such small files could cause inefficient processing of
+ parallel queries that are expecting multi-megabyte input files. The example shows how you might compact the small files by doing
+ an <codeph>INSERT ... SELECT</codeph> into a different table, possibly converting the data to Parquet in the process:
+ </p>
+
+<codeblock>[localhost:21000] > insert into unpartitioned_text values (10,'hello'), (20, 'world');
+[localhost:21000] > insert into unpartitioned_text values (-1,'foo'), (-1000, 'bar');
+[localhost:21000] > show files in unpartitioned_text;
++---------------------------------------------------------------------------------------------+----------+-----------+
+| path | size | partition |
++---------------------------------------------------------------------------------------------+----------+-----------+
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_text/4f11b8bdf8b6aa92_238145083_data.0. | 18B | |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_text/35665776ef85cfaf_1012432410_data.0. | 448.31MB | |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_text/ac3dba252a8952b8_1663177415_data.0. | 2.19GB | |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_text/cfb8252452445682_1868457216_data.0. | 17B | |
++---------------------------------------------------------------------------------------------+----------+-----------+
+[localhost:21000] > create table unpartitioned_parquet stored as parquet as select * from unpartitioned_text;
++---------------------------+
+| summary |
++---------------------------+
+| Inserted 120000002 row(s) |
++---------------------------+
+[localhost:21000] > show files in unpartitioned_parquet;
++----------------------------------------------------------------------------------------------------+----------+-----------+
+| path | size | partition |
++----------------------------------------------------------------------------------------------------+----------+-----------+
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_parquet/60798d96ba630184_549959007_data.0.parq | 255.36MB | |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_parquet/60798d96ba630184_549959007_data.1.parq | 178.52MB | |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_parquet/60798d96ba630185_549959007_data.0.parq | 255.37MB | |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_parquet/60798d96ba630185_549959007_data.1.parq | 57.71MB | |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_parquet/60798d96ba630186_2141167244_data.0.parq | 255.40MB | |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_parquet/60798d96ba630186_2141167244_data.1.parq | 175.52MB | |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_parquet/60798d96ba630187_1006832086_data.0.parq | 255.40MB | |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/unpartitioned_parquet/60798d96ba630187_1006832086_data.1.parq | 214.61MB | |
++----------------------------------------------------------------------------------------------------+----------+-----------+
+</codeblock>
+
+ <p>
+ The following example shows a <codeph>SHOW FILES</codeph> statement for a partitioned text table
+ with data in two different partitions, and two empty partitions.
+ The partitions with no data are not represented in the <codeph>SHOW FILES</codeph> output.
+ </p>
+<codeblock>[localhost:21000] > create table partitioned_text (x bigint, y int, s string) partitioned by (year bigint, month bigint, day bigint);
+[localhost:21000] > insert overwrite partitioned_text (x, y, s) partition (year=2014,month=1,day=1) select id, val, name from oreilly.normalized_parquet
+where id between 1 and 1000000;
+[localhost:21000] > insert overwrite partitioned_text (x, y, s) partition (year=2014,month=1,day=2) select id, val, name from oreilly.normalized_parquet
+where id between 1000001 and 2000000;
+[localhost:21000] > alter table partitioned_text add partition (year=2014,month=1,day=3);
+[localhost:21000] > alter table partitioned_text add partition (year=2014,month=1,day=4);
+[localhost:21000] > show partitions partitioned_text;
++-------+-------+-----+-------+--------+---------+--------------+-------------------+--------+-------------------+
+| year | month | day | #Rows | #Files | Size | Bytes Cached | Cache Replication | Format | Incremental stats |
++-------+-------+-----+-------+--------+---------+--------------+-------------------+--------+-------------------+
+| 2014 | 1 | 1 | -1 | 4 | 25.16MB | NOT CACHED | NOT CACHED | TEXT | false |
+| 2014 | 1 | 2 | -1 | 4 | 26.22MB | NOT CACHED | NOT CACHED | TEXT | false |
+| 2014 | 1 | 3 | -1 | 0 | 0B | NOT CACHED | NOT CACHED | TEXT | false |
+| 2014 | 1 | 4 | -1 | 0 | 0B | NOT CACHED | NOT CACHED | TEXT | false |
+| Total | | | -1 | 8 | 51.38MB | 0B | | | |
++-------+-------+-----+-------+--------+---------+--------------+-------------------+--------+-------------------+
+[localhost:21000] > show files in partitioned_text;
++----------------------------------------------------------------------------------------------------------------+--------+-------------------------+
+| path | size | partition |
++----------------------------------------------------------------------------------------------------------------+--------+-------------------------+
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_text/year=2014/month=1/day=1/80732d9dc80689f_1418645991_data.0. | 5.77MB | year=2014/month=1/day=1 |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_text/year=2014/month=1/day=1/80732d9dc8068a0_1418645991_data.0. | 6.25MB | year=2014/month=1/day=1 |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_text/year=2014/month=1/day=1/80732d9dc8068a1_147082319_data.0. | 7.16MB | year=2014/month=1/day=1 |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_text/year=2014/month=1/day=1/80732d9dc8068a2_2111411753_data.0. | 5.98MB | year=2014/month=1/day=1 |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_text/year=2014/month=1/day=2/21a828cf494b5bbb_501271652_data.0. | 6.42MB | year=2014/month=1/day=2 |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_text/year=2014/month=1/day=2/21a828cf494b5bbc_501271652_data.0. | 6.62MB | year=2014/month=1/day=2 |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_text/year=2014/month=1/day=2/21a828cf494b5bbd_1393490200_data.0. | 6.98MB | year=2014/month=1/day=2 |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_text/year=2014/month=1/day=2/21a828cf494b5bbe_1393490200_data.0. | 6.20MB | year=2014/month=1/day=2 |
++----------------------------------------------------------------------------------------------------------------+--------+-------------------------+
+</codeblock>
+ <p>
+ The following example shows a <codeph>SHOW FILES</codeph> statement for a partitioned Parquet table.
+ The number and sizes of files are different from the equivalent partitioned text table
+ used in the previous example, because <codeph>INSERT</codeph> operations for Parquet tables
+ are parallelized differently than for text tables. (Also, the amount of data is so small
+ that it can be written to Parquet without involving all the hosts in this 4-node cluster.)
+ </p>
+<codeblock>[localhost:21000] > create table partitioned_parquet (x bigint, y int, s string) partitioned by (year bigint, month bigint, day bigint) stored as parquet;
+[localhost:21000] > insert into partitioned_parquet partition (year,month,day) select x, y, s, year, month, day from partitioned_text;
+[localhost:21000] > show partitions partitioned_parquet;
++-------+-------+-----+-------+--------+---------+--------------+-------------------+---------+-------------------+
+| year | month | day | #Rows | #Files | Size | Bytes Cached | Cache Replication | Format | Incremental stats |
++-------+-------+-----+-------+--------+---------+--------------+-------------------+---------+-------------------+
+| 2014 | 1 | 1 | -1 | 3 | 17.89MB | NOT CACHED | NOT CACHED | PARQUET | false |
+| 2014 | 1 | 2 | -1 | 3 | 17.89MB | NOT CACHED | NOT CACHED | PARQUET | false |
+| Total | | | -1 | 6 | 35.79MB | 0B | | | |
++-------+-------+-----+-------+--------+---------+--------------+-------------------+---------+-------------------+
+[localhost:21000] > show files in partitioned_parquet;
++---------------------------------------------------------------------------------------------------------+--------+-------------------------+
+| path | size | partition |
++---------------------------------------------------------------------------------------------------------+--------+-------------------------+
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_parquet/year=2014/month=1/day=1/1134113650_data.0.parq | 4.49MB | year=2014/month=1/day=1 |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_parquet/year=2014/month=1/day=1/617567880_data.0.parq | 5.14MB | year=2014/month=1/day=1 |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_parquet/year=2014/month=1/day=1/2099499416_data.0.parq | 8.27MB | year=2014/month=1/day=1 |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_parquet/year=2014/month=1/day=2/945567189_data.0.parq | 8.80MB | year=2014/month=1/day=2 |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_parquet/year=2014/month=1/day=2/2145850112_data.0.parq | 4.80MB | year=2014/month=1/day=2 |
+| hdfs://<varname>impala_data_dir</varname>/show_files.db/partitioned_parquet/year=2014/month=1/day=2/665613448_data.0.parq | 4.29MB | year=2014/month=1/day=2 |
++---------------------------------------------------------------------------------------------------------+--------+-------------------------+
+</codeblock>
+<p>
+ The following example shows output from the <codeph>SHOW FILES</codeph> statement
+ for a table where the data files are stored in Amazon S3:
+</p>
+<codeblock>[localhost:21000] > show files in s3_testing.sample_data_s3;
++-----------------------------------------------------------------------+---------+-----------+
+| path | size | partition |
++-----------------------------------------------------------------------+---------+-----------+
+| s3a://impala-demo/sample_data/e065453cba1988a6_1733868553_data.0.parq | 24.84MB | |
++-----------------------------------------------------------------------+---------+-----------+
+</codeblock>
+<!--
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_authorization.xml#authorization"/>
+ </p>
+-->
+ </conbody>
+ </concept>
+
+ <concept rev="2.0.0" id="show_roles">
+
+ <title>SHOW ROLES Statement</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Sentry"/>
+ <data name="Category" value="Roles"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ The <codeph>SHOW ROLES</codeph> statement displays roles. This syntax is available in CDH 5.2 and later
+ only, when you are using the Sentry authorization framework along with the Sentry service, as described in
+ <xref href="impala_authorization.xml#sentry_service"/>. It does not apply when you use the Sentry framework
+ with privileges defined in a policy file.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/show_security"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ Depending on the roles set up within your organization by the <codeph>CREATE ROLE</codeph> statement, the
+ output might look something like this:
+ </p>
+
+<codeblock>show roles;
++-----------+
+| role_name |
++-----------+
+| analyst |
+| role1 |
+| sales |
+| superuser |
+| test_role |
++-----------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_authorization.xml#authorization"/>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept rev="2.0.0" id="show_current_role">
+
+ <title>SHOW CURRENT ROLE</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Sentry"/>
+ <data name="Category" value="Roles"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p rev="2.0.0">
+ The <codeph>SHOW CURRENT ROLE</codeph> statement displays roles assigned to the current user. This syntax
+ is available in CDH 5.2 and later only, when you are using the Sentry authorization framework along with
+ the Sentry service, as described in <xref href="impala_authorization.xml#sentry_service"/>. It does not
+ apply when you use the Sentry framework with privileges defined in a policy file.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/show_security"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ Depending on the roles set up within your organization by the <codeph>CREATE ROLE</codeph> statement, the
+ output might look something like this:
+ </p>
+
+<codeblock>show current roles;
++-----------+
+| role_name |
++-----------+
+| role1 |
+| superuser |
++-----------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_authorization.xml#authorization"/>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="show_role_grant">
+
+ <title>SHOW ROLE GRANT Statement</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Sentry"/>
+ <data name="Category" value="Roles"/>
+ </metadata>
+ </prolog>
+
+
+ <conbody>
+
+ <p rev="2.0.0">
+<!-- Copied from Sentry docs. Turn into conref. I did some rewording for clarity. -->
+ The <codeph>SHOW ROLE GRANT</codeph> statement lists all the roles assigned to the specified group. This
+ statement is only allowed for Sentry administrative users and other users that are part of the specified
+ group. This syntax is available in CDH 5.2 and later only, when you are using the Sentry authorization
+ framework along with the Sentry service, as described in
+ <xref href="impala_authorization.xml#sentry_service"/>. It does not apply when you use the Sentry framework
+ with privileges defined in a policy file.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/show_security"/>
+
+<!--
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>To do: construct example for SHOW ROLE GRANT</codeblock>
+-->
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_authorization.xml#authorization"/>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept rev="2.0.0" id="show_grant_role">
+
+ <title>SHOW GRANT ROLE Statement</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Sentry"/>
+ <data name="Category" value="Roles"/>
+ </metadata>
+ </prolog>
+
+
+ <conbody>
+
+ <p>
+<!-- Copied from Sentry docs. Turn into conref. I did some rewording for clarity. -->
+ The <codeph>SHOW GRANT ROLE</codeph> statement lists all the grants for the given role name. This statement
+ is only allowed for Sentry administrative users and other users that have been granted the specified role.
+ This syntax is available in CDH 5.2 and later only, when you are using the Sentry authorization framework
+ along with the Sentry service, as described in <xref href="impala_authorization.xml#sentry_service"/>. It
+ does not apply when you use the Sentry framework with privileges defined in a policy file.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/show_security"/>
+
+<!--
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>To do: construct example for SHOW GRANT ROLE</codeblock>
+-->
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_authorization.xml#authorization"/>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="show_databases">
+
+ <title>SHOW DATABASES</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Databases"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ The <codeph>SHOW DATABASES</codeph> statement is often the first one you issue when connecting to an
+ instance for the first time. You typically issue <codeph>SHOW DATABASES</codeph> to see the names you can
+ specify in a <codeph>USE <varname>db_name</varname></codeph> statement, then after switching to a database
+ you issue <codeph>SHOW TABLES</codeph> to see the names you can specify in <codeph>SELECT</codeph> and
+ <codeph>INSERT</codeph> statements.
+ </p>
+
+ <p>
+ The output of <codeph>SHOW DATABASES</codeph> includes the special <codeph>_impala_builtins</codeph>
+ database, which lets you view definitions of built-in functions, as described under <codeph>SHOW
+ FUNCTIONS</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/show_security"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ This example shows how you might locate a particular table on an unfamiliar system. The
+ <codeph>DEFAULT</codeph> database is the one you initially connect to; a database with that name is present
+ on every system. You can issue <codeph>SHOW TABLES IN <varname>db_name</varname></codeph> without going
+ into a database, or <codeph>SHOW TABLES</codeph> once you are inside a particular database.
+ </p>
+
+<codeblock>[localhost:21000] > show databases;
++--------------------+
+| name |
++--------------------+
+| _impala_builtins |
+| analyze_testing |
+| avro |
+| ctas |
+| d1 |
+| d2 |
+| d3 |
+| default |
+| file_formats |
+| hbase |
+| load_data |
+| partitioning |
+| regexp_testing |
+| reports |
+| temporary |
++--------------------+
+Returned 15 row(s) in 0.02s
+[localhost:21000] > show tables in file_formats;
++--------------------+
+| name |
++--------------------+
+| parquet_table |
+| rcfile_table |
+| sequencefile_table |
+| textfile_table |
++--------------------+
+Returned 4 row(s) in 0.01s
+[localhost:21000] > use file_formats;
+[localhost:21000] > show tables like '*parq*';
++--------------------+
+| name |
++--------------------+
+| parquet_table |
++--------------------+
+Returned 1 row(s) in 0.01s</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_databases.xml#databases"/>, <xref href="impala_create_database.xml#create_database"/>,
+ <xref href="impala_drop_database.xml#drop_database"/>, <xref href="impala_use.xml#use"/>,
+ <xref href="impala_show.xml#show_tables"/>,
+ <xref href="impala_show.xml#show_functions"/>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="show_tables">
+
+ <title>SHOW TABLES Statement</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Displays the names of tables. By default, lists tables in the current database, or with the
+ <codeph>IN</codeph> clause, in a specified database. By default, lists all tables, or with the
+ <codeph>LIKE</codeph> clause, only those whose names match a pattern with <codeph>*</codeph> wildcards.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/show_security"/>
+
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have read and execute
+ permissions for all directories that are part of the table.
+ (A table could span multiple different HDFS directories if it is partitioned.
+ The directories could be widely scattered because a partition can reside
+ in an arbitrary HDFS directory based on its <codeph>LOCATION</codeph> attribute.)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following examples demonstrate the <codeph>SHOW TABLES</codeph> statement.
+ If the database contains no tables, the result set is empty.
+ If the database does contain tables, <codeph>SHOW TABLES IN <varname>db_name</varname></codeph>
+ lists all the table names. <codeph>SHOW TABLES</codeph> with no qualifiers lists
+ all the table names in the current database.
+ </p>
+
+<codeblock>create database empty_db;
+show tables in empty_db;
+Fetched 0 row(s) in 0.11s
+
+create database full_db;
+create table full_db.t1 (x int);
+create table full_db.t2 like full_db.t1;
+
+show tables in full_db;
++------+
+| name |
++------+
+| t1 |
+| t2 |
++------+
+
+use full_db;
+show tables;
++------+
+| name |
++------+
+| t1 |
+| t2 |
++------+
+</codeblock>
+
+ <p>
+ This example demonstrates how <codeph>SHOW TABLES LIKE '<varname>wildcard_pattern</varname>'</codeph>
+ lists table names that match a pattern, or multiple alternative patterns.
+ The ability to do wildcard matches for table names makes it helpful to establish naming conventions for tables to
+ conveniently locate a group of related tables.
+ </p>
+
+<codeblock>create table fact_tbl (x int);
+create table dim_tbl_1 (s string);
+create table dim_tbl_2 (s string);
+
+/* Asterisk is the wildcard character. Only 2 out of the 3 just-created tables are returned. */
+show tables like 'dim*';
++-----------+
+| name |
++-----------+
+| dim_tbl_1 |
+| dim_tbl_2 |
++-----------+
+
+/* We are already in the FULL_DB database, but just to be sure we can specify the database name also. */
+show tables in full_db like 'dim*';
++-----------+
+| name |
++-----------+
+| dim_tbl_1 |
+| dim_tbl_2 |
++-----------+
+
+/* The pipe character separates multiple wildcard patterns. */
+show tables like '*dim*|t*';
++-----------+
+| name |
++-----------+
+| dim_tbl_1 |
+| dim_tbl_2 |
+| t1 |
+| t2 |
++-----------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_tables.xml#tables"/>, <xref href="impala_create_table.xml#create_table"/>,
+ <xref href="impala_alter_table.xml#alter_table"/>, <xref href="impala_drop_table.xml#drop_table"/>,
+ <xref href="impala_describe.xml#describe"/>, <xref href="impala_show.xml#show_create_table"/>,
+ <xref href="impala_show.xml#show_table_stats"/>,
+ <xref href="impala_show.xml#show_databases"/>,
+ <xref href="impala_show.xml#show_functions"/>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept rev="1.2.1" id="show_create_table">
+
+ <title>SHOW CREATE TABLE Statement</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Schemas"/>
+ <data name="Category" value="Impala Data Types"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ As a schema changes over time, you might run a <codeph>CREATE TABLE</codeph> statement followed by several
+ <codeph>ALTER TABLE</codeph> statements. To capture the cumulative effect of all those statements,
+ <codeph>SHOW CREATE TABLE</codeph> displays a <codeph>CREATE TABLE</codeph> statement that would reproduce
+ the current structure of a table. You can use this output in scripts that set up or clone a group of
+ tables, rather than trying to reproduce the original sequence of <codeph>CREATE TABLE</codeph> and
+ <codeph>ALTER TABLE</codeph> statements. When creating variations on the original table, or cloning the
+ original table on a different system, you might need to edit the <codeph>SHOW CREATE TABLE</codeph> output
+ to change things such as the database name, <codeph>LOCATION</codeph> field, and so on that might be
+ different on the destination system.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/show_security"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example shows how various clauses from the <codeph>CREATE TABLE</codeph> statement are
+ represented in the output of <codeph>SHOW CREATE TABLE</codeph>.
+ </p>
+
+<codeblock>create table show_create_table_demo (id int comment "Unique ID", y double, s string)
+ partitioned by (year smallint)
+ stored as parquet;
+
+show create table show_create_table_demo;
++----------------------------------------------------------------------------------------+
+| result |
++----------------------------------------------------------------------------------------+
+| CREATE TABLE scratch.show_create_table_demo ( |
+| id INT COMMENT 'Unique ID', |
+| y DOUBLE, |
+| s STRING |
+| ) |
+| PARTITIONED BY ( |
+| year SMALLINT |
+| ) |
+| STORED AS PARQUET |
+| LOCATION 'hdfs://127.0.0.1:8020/user/hive/warehouse/scratch.db/show_create_table_demo' |
+| TBLPROPERTIES ('transient_lastDdlTime'='1418152582') |
++----------------------------------------------------------------------------------------+
+</codeblock>
+
+ <p>
+ The following example shows how, after a sequence of <codeph>ALTER TABLE</codeph> statements, the output
+ from <codeph>SHOW CREATE TABLE</codeph> represents the current state of the table. This output could be
+ used to create a matching table rather than executing the original <codeph>CREATE TABLE</codeph> and
+ sequence of <codeph>ALTER TABLE</codeph> statements.
+ </p>
+
+<codeblock>alter table show_create_table_demo drop column s;
+alter table show_create_table_demo set fileformat textfile;
+
+show create table show_create_table_demo;
++----------------------------------------------------------------------------------------+
+| result |
++----------------------------------------------------------------------------------------+
+| CREATE TABLE scratch.show_create_table_demo ( |
+| id INT COMMENT 'Unique ID', |
+| y DOUBLE |
+| ) |
+| PARTITIONED BY ( |
+| year SMALLINT |
+| ) |
+| STORED AS TEXTFILE |
+| LOCATION 'hdfs://127.0.0.1:8020/user/hive/warehouse/demo.db/show_create_table_demo' |
+| TBLPROPERTIES ('transient_lastDdlTime'='1418152638') |
++----------------------------------------------------------------------------------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_create_table.xml#create_table"/>, <xref href="impala_describe.xml#describe"/>,
+ <xref href="impala_show.xml#show_tables"/>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="show_table_stats">
+
+ <title>SHOW TABLE STATS Statement</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Performance"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ The <codeph>SHOW TABLE STATS</codeph> and <codeph>SHOW COLUMN STATS</codeph> variants are important for
+ tuning performance and diagnosing performance issues, especially with the largest tables and the most
+ complex join queries.
+ </p>
+
+ <p>
+ Any values that are not available (because the <codeph>COMPUTE STATS</codeph> statement has not been run
+ yet) are displayed as <codeph>-1</codeph>.
+ </p>
+
+ <p>
+ <codeph>SHOW TABLE STATS</codeph> provides some general information about the table, such as the number of
+ files, overall size of the data, whether some or all of the data is in the HDFS cache, and the file format,
+ that is useful whether or not you have run the <codeph>COMPUTE STATS</codeph> statement. A
+ <codeph>-1</codeph> in the <codeph>#Rows</codeph> output column indicates that the <codeph>COMPUTE
+ STATS</codeph> statement has never been run for this table. If the table is partitioned, <codeph>SHOW TABLE
+ STATS</codeph> provides this information for each partition. (It produces the same output as the
+ <codeph>SHOW PARTITIONS</codeph> statement in this case.)
+ </p>
+
+ <p>
+ The output of <codeph>SHOW COLUMN STATS</codeph> is primarily only useful after the <codeph>COMPUTE
+ STATS</codeph> statement has been run on the table. A <codeph>-1</codeph> in the <codeph>#Distinct
+ Values</codeph> output column indicates that the <codeph>COMPUTE STATS</codeph> statement has never been
+ run for this table. Currently, Impala always leaves the <codeph>#Nulls</codeph> column as
+ <codeph>-1</codeph>, even after <codeph>COMPUTE STATS</codeph> has been run.
+ </p>
+
+ <p>
+ These <codeph>SHOW</codeph> statements work on actual tables only, not on views.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/show_security"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following examples show how the <codeph>SHOW TABLE STATS</codeph> statement displays physical
+ information about a table and the associated data files:
+ </p>
+
+<codeblock>show table stats store_sales;
++-------+--------+----------+--------------+--------+-------------------+
+| #Rows | #Files | Size | Bytes Cached | Format | Incremental stats |
++-------+--------+----------+--------------+--------+-------------------+
+| -1 | 1 | 370.45MB | NOT CACHED | TEXT | false |
++-------+--------+----------+--------------+--------+-------------------+
+
+show table stats customer;
++-------+--------+---------+--------------+--------+-------------------+
+| #Rows | #Files | Size | Bytes Cached | Format | Incremental stats |
++-------+--------+---------+--------------+--------+-------------------+
+| -1 | 1 | 12.60MB | NOT CACHED | TEXT | false |
++-------+--------+---------+--------------+--------+-------------------+
+</codeblock>
+
+ <p>
+ The following example shows how, after a <codeph>COMPUTE STATS</codeph> or <codeph>COMPUTE INCREMENTAL
+ STATS</codeph> statement, the <codeph>#Rows</codeph> field is now filled in. Because the
+ <codeph>STORE_SALES</codeph> table in this example is not partitioned, the <codeph>COMPUTE INCREMENTAL
+ STATS</codeph> statement produces regular stats rather than incremental stats, therefore the
+ <codeph>Incremental stats</codeph> field remains <codeph>false</codeph>.
+ </p>
+
+<codeblock>compute stats customer;
++------------------------------------------+
+| summary |
++------------------------------------------+
+| Updated 1 partition(s) and 18 column(s). |
++------------------------------------------+
+
+show table stats customer;
++--------+--------+---------+--------------+--------+-------------------+
+| #Rows | #Files | Size | Bytes Cached | Format | Incremental stats |
++--------+--------+---------+--------------+--------+-------------------+
+| 100000 | 1 | 12.60MB | NOT CACHED | TEXT | false |
++--------+--------+---------+--------------+--------+-------------------+
+
+compute incremental stats store_sales;
++------------------------------------------+
+| summary |
++------------------------------------------+
+| Updated 1 partition(s) and 23 column(s). |
++------------------------------------------+
+
+show table stats store_sales;
++---------+--------+----------+--------------+--------+-------------------+
+| #Rows | #Files | Size | Bytes Cached | Format | Incremental stats |
++---------+--------+----------+--------------+--------+-------------------+
+| 2880404 | 1 | 370.45MB | NOT CACHED | TEXT | false |
++---------+--------+----------+--------------+--------+-------------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have read and execute
+ permissions for all directories that are part of the table.
+ (A table could span multiple different HDFS directories if it is partitioned.
+ The directories could be widely scattered because a partition can reside
+ in an arbitrary HDFS directory based on its <codeph>LOCATION</codeph> attribute.)
+ The Impala user must also have execute
+ permission for the database directory, and any parent directories of the database directory in HDFS.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_compute_stats.xml#compute_stats"/>, <xref href="impala_show.xml#show_column_stats"/>
+ </p>
+
+ <p>
+ See <xref href="impala_perf_stats.xml#perf_stats"/> for usage information and examples.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="show_column_stats">
+
+ <title>SHOW COLUMN STATS Statement</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Performance"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ The <codeph>SHOW TABLE STATS</codeph> and <codeph>SHOW COLUMN STATS</codeph> variants are important for
+ tuning performance and diagnosing performance issues, especially with the largest tables and the most
+ complex join queries.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/show_security"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following examples show the output of the <codeph>SHOW COLUMN STATS</codeph> statement for some tables,
+ before the <codeph>COMPUTE STATS</codeph> statement is run. Impala deduces some information, such as
+ maximum and average size for fixed-length columns, and leaves unknown values as <codeph>-1</codeph>.
+ </p>
+
+<codeblock>show column stats customer;
++------------------------+--------+------------------+--------+----------+----------+
+| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size |
++------------------------+--------+------------------+--------+----------+----------+
+| c_customer_sk | INT | -1 | -1 | 4 | 4 |
+| c_customer_id | STRING | -1 | -1 | -1 | -1 |
+| c_current_cdemo_sk | INT | -1 | -1 | 4 | 4 |
+| c_current_hdemo_sk | INT | -1 | -1 | 4 | 4 |
+| c_current_addr_sk | INT | -1 | -1 | 4 | 4 |
+| c_first_shipto_date_sk | INT | -1 | -1 | 4 | 4 |
+| c_first_sales_date_sk | INT | -1 | -1 | 4 | 4 |
+| c_salutation | STRING | -1 | -1 | -1 | -1 |
+| c_first_name | STRING | -1 | -1 | -1 | -1 |
+| c_last_name | STRING | -1 | -1 | -1 | -1 |
+| c_preferred_cust_flag | STRING | -1 | -1 | -1 | -1 |
+| c_birth_day | INT | -1 | -1 | 4 | 4 |
+| c_birth_month | INT | -1 | -1 | 4 | 4 |
+| c_birth_year | INT | -1 | -1 | 4 | 4 |
+| c_birth_country | STRING | -1 | -1 | -1 | -1 |
+| c_login | STRING | -1 | -1 | -1 | -1 |
+| c_email_address | STRING | -1 | -1 | -1 | -1 |
+| c_last_review_date | STRING | -1 | -1 | -1 | -1 |
++------------------------+--------+------------------+--------+----------+----------+
+
+show column stats store_sales;
++-----------------------+-------+------------------+--------+----------+----------+
+| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size |
++-----------------------+-------+------------------+--------+----------+----------+
+| ss_sold_date_sk | INT | -1 | -1 | 4 | 4 |
+| ss_sold_time_sk | INT | -1 | -1 | 4 | 4 |
+| ss_item_sk | INT | -1 | -1 | 4 | 4 |
+| ss_customer_sk | INT | -1 | -1 | 4 | 4 |
+| ss_cdemo_sk | INT | -1 | -1 | 4 | 4 |
+| ss_hdemo_sk | INT | -1 | -1 | 4 | 4 |
+| ss_addr_sk | INT | -1 | -1 | 4 | 4 |
+| ss_store_sk | INT | -1 | -1 | 4 | 4 |
+| ss_promo_sk | INT | -1 | -1 | 4 | 4 |
+| ss_ticket_number | INT | -1 | -1 | 4 | 4 |
+| ss_quantity | INT | -1 | -1 | 4 | 4 |
+| ss_wholesale_cost | FLOAT | -1 | -1 | 4 | 4 |
+| ss_list_price | FLOAT | -1 | -1 | 4 | 4 |
+| ss_sales_price | FLOAT | -1 | -1 | 4 | 4 |
+| ss_ext_discount_amt | FLOAT | -1 | -1 | 4 | 4 |
+| ss_ext_sales_price | FLOAT | -1 | -1 | 4 | 4 |
+| ss_ext_wholesale_cost | FLOAT | -1 | -1 | 4 | 4 |
+| ss_ext_list_price | FLOAT | -1 | -1 | 4 | 4 |
+| ss_ext_tax | FLOAT | -1 | -1 | 4 | 4 |
+| ss_coupon_amt | FLOAT | -1 | -1 | 4 | 4 |
+| ss_net_paid | FLOAT | -1 | -1 | 4 | 4 |
+| ss_net_paid_inc_tax | FLOAT | -1 | -1 | 4 | 4 |
+| ss_net_profit | FLOAT | -1 | -1 | 4 | 4 |
++-----------------------+-------+------------------+--------+----------+----------+
+</codeblock>
+
+ <p>
+ The following examples show the output of the <codeph>SHOW COLUMN STATS</codeph> statement for some tables,
+ after the <codeph>COMPUTE STATS</codeph> statement is run. Now most of the <codeph>-1</codeph> values are
+ changed to reflect the actual table data. The <codeph>#Nulls</codeph> column remains <codeph>-1</codeph>
+ because Impala does not use the number of <codeph>NULL</codeph> values to influence query planning.
+ </p>
+
+<codeblock>compute stats customer;
++------------------------------------------+
+| summary |
++------------------------------------------+
+| Updated 1 partition(s) and 18 column(s). |
++------------------------------------------+
+
+compute stats store_sales;
++------------------------------------------+
+| summary |
++------------------------------------------+
+| Updated 1 partition(s) and 23 column(s). |
++------------------------------------------+
+
+show column stats customer;
++------------------------+--------+------------------+--------+----------+--------+
+| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size |
++------------------------+--------+------------------+--------+----------+--------+
+| c_customer_sk | INT | 139017 | -1 | 4 | 4 |
+| c_customer_id | STRING | 111904 | -1 | 16 | 16 |
+| c_current_cdemo_sk | INT | 95837 | -1 | 4 | 4 |
+| c_current_hdemo_sk | INT | 8097 | -1 | 4 | 4 |
+| c_current_addr_sk | INT | 57334 | -1 | 4 | 4 |
+| c_first_shipto_date_sk | INT | 4374 | -1 | 4 | 4 |
+| c_first_sales_date_sk | INT | 4409 | -1 | 4 | 4 |
+| c_salutation | STRING | 7 | -1 | 4 | 3.1308 |
+| c_first_name | STRING | 3887 | -1 | 11 | 5.6356 |
+| c_last_name | STRING | 4739 | -1 | 13 | 5.9106 |
+| c_preferred_cust_flag | STRING | 3 | -1 | 1 | 0.9656 |
+| c_birth_day | INT | 31 | -1 | 4 | 4 |
+| c_birth_month | INT | 12 | -1 | 4 | 4 |
+| c_birth_year | INT | 71 | -1 | 4 | 4 |
+| c_birth_country | STRING | 205 | -1 | 20 | 8.4001 |
+| c_login | STRING | 1 | -1 | 0 | 0 |
+| c_email_address | STRING | 94492 | -1 | 46 | 26.485 |
+| c_last_review_date | STRING | 349 | -1 | 7 | 6.7561 |
++------------------------+--------+------------------+--------+----------+--------+
+
+show column stats store_sales;
++-----------------------+-------+------------------+--------+----------+----------+
+| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size |
++-----------------------+-------+------------------+--------+----------+----------+
+| ss_sold_date_sk | INT | 4395 | -1 | 4 | 4 |
+| ss_sold_time_sk | INT | 63617 | -1 | 4 | 4 |
+| ss_item_sk | INT | 19463 | -1 | 4 | 4 |
+| ss_customer_sk | INT | 122720 | -1 | 4 | 4 |
+| ss_cdemo_sk | INT | 242982 | -1 | 4 | 4 |
+| ss_hdemo_sk | INT | 8097 | -1 | 4 | 4 |
+| ss_addr_sk | INT | 70770 | -1 | 4 | 4 |
+| ss_store_sk | INT | 6 | -1 | 4 | 4 |
+| ss_promo_sk | INT | 355 | -1 | 4 | 4 |
+| ss_ticket_number | INT | 304098 | -1 | 4 | 4 |
+| ss_quantity | INT | 105 | -1 | 4 | 4 |
+| ss_wholesale_cost | FLOAT | 9600 | -1 | 4 | 4 |
+| ss_list_price | FLOAT | 22191 | -1 | 4 | 4 |
+| ss_sales_price | FLOAT | 20693 | -1 | 4 | 4 |
+| ss_ext_discount_amt | FLOAT | 228141 | -1 | 4 | 4 |
+| ss_ext_sales_price | FLOAT | 433550 | -1 | 4 | 4 |
+| ss_ext_wholesale_cost | FLOAT | 406291 | -1 | 4 | 4 |
+| ss_ext_list_price | FLOAT | 574871 | -1 | 4 | 4 |
+| ss_ext_tax | FLOAT | 91806 | -1 | 4 | 4 |
+| ss_coupon_amt | FLOAT | 228141 | -1 | 4 | 4 |
+| ss_net_paid | FLOAT | 493107 | -1 | 4 | 4 |
+| ss_net_paid_inc_tax | FLOAT | 653523 | -1 | 4 | 4 |
+| ss_net_profit | FLOAT | 611934 | -1 | 4 | 4 |
++-----------------------+-------+------------------+--------+----------+----------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have read and execute
+ permissions for all directories that are part of the table.
+ (A table could span multiple different HDFS directories if it is partitioned.
+ The directories could be widely scattered because a partition can reside
+ in an arbitrary HDFS directory based on its <codeph>LOCATION</codeph> attribute.)
+ The Impala user must also have execute
+ permission for the database directory, and any parent directories of the database directory in HDFS.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_compute_stats.xml#compute_stats"/>, <xref href="impala_show.xml#show_table_stats"/>
+ </p>
+
+ <p>
+ See <xref href="impala_perf_stats.xml#perf_stats"/> for usage information and examples.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept rev="1.4.0" id="show_partitions">
+
+ <title>SHOW PARTITIONS Statement</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Schemas"/>
+ <!-- At some point, need to figure out categories related to partitioning. (Partitioned Tables etc.) -->
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <codeph>SHOW PARTITIONS</codeph> displays information about each partition for a partitioned table. (The
+ output is the same as the <codeph>SHOW TABLE STATS</codeph> statement, but <codeph>SHOW PARTITIONS</codeph>
+ only works on a partitioned table.) Because it displays table statistics for all partitions, the output is
+ more informative if you have run the <codeph>COMPUTE STATS</codeph> statement after creating all the
+ partitions. See <xref href="impala_compute_stats.xml#compute_stats"/> for details. For example, on a
+ <codeph>CENSUS</codeph> table partitioned on the <codeph>YEAR</codeph> column:
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/show_security"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock rev="1.4.0">[localhost:21000] > show partitions census;
++-------+-------+--------+------+---------+
+| year | #Rows | #Files | Size | Format |
++-------+-------+--------+------+---------+
+| 2000 | -1 | 0 | 0B | TEXT |
+| 2004 | -1 | 0 | 0B | TEXT |
+| 2008 | -1 | 0 | 0B | TEXT |
+| 2010 | -1 | 0 | 0B | TEXT |
+| 2011 | 4 | 1 | 22B | TEXT |
+| 2012 | 4 | 1 | 22B | TEXT |
+| 2013 | 1 | 1 | 231B | PARQUET |
+| Total | 9 | 3 | 275B | |
++-------+-------+--------+------+---------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have read and execute
+ permissions for all directories that are part of the table.
+ (A table could span multiple different HDFS directories if it is partitioned.
+ The directories could be widely scattered because a partition can reside
+ in an arbitrary HDFS directory based on its <codeph>LOCATION</codeph> attribute.)
+ The Impala user must also have execute
+ permission for the database directory, and any parent directories of the database directory in HDFS.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ See <xref href="impala_perf_stats.xml#perf_stats"/> for usage information and examples.
+ </p>
+
+ <p>
+ <xref href="impala_show.xml#show_table_stats"/>, <xref href="impala_partitioning.xml#partitioning"/>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept rev="1.3.0" id="show_functions">
+
+ <title>SHOW FUNCTIONS Statement</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="UDFs"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ By default, <codeph>SHOW FUNCTIONS</codeph> displays user-defined functions (UDFs) and <codeph>SHOW
+ AGGREGATE FUNCTIONS</codeph> displays user-defined aggregate functions (UDAFs) associated with a particular
+ database. The output from <codeph>SHOW FUNCTIONS</codeph> includes the argument signature of each function.
+ You specify this argument signature as part of the <codeph>DROP FUNCTION</codeph> statement. You might have
+ several UDFs with the same name, each accepting different argument data types.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/show_security"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ To display Impala built-in functions, specify the special database name <codeph>_impala_builtins</codeph>:
+ </p>
+
+<codeblock>show functions in _impala_builtins;
++----------------+----------------------------------------+
+| return type | signature |
++----------------+----------------------------------------+
+| BOOLEAN | ifnull(BOOLEAN, BOOLEAN) |
+| TINYINT | ifnull(TINYINT, TINYINT) |
+| SMALLINT | ifnull(SMALLINT, SMALLINT) |
+| INT | ifnull(INT, INT) |
+...
+
+show functions in _impala_builtins like '*week*';
++-------------+------------------------------+
+| return type | signature |
++-------------+------------------------------+
+| INT | weekofyear(TIMESTAMP) |
+| TIMESTAMP | weeks_add(TIMESTAMP, INT) |
+| TIMESTAMP | weeks_add(TIMESTAMP, BIGINT) |
+| TIMESTAMP | weeks_sub(TIMESTAMP, INT) |
+| TIMESTAMP | weeks_sub(TIMESTAMP, BIGINT) |
+| INT | dayofweek(TIMESTAMP) |
++-------------+------------------------------+
+</codeblock>
+
+ <p>
+ To search for functions that use a particular data type, specify a case-sensitive data type name in all
+ capitals:
+ </p>
+
+<codeblock>show functions in _impala_builtins like '*BIGINT*';
++----------------------------------------+
+| name |
++----------------------------------------+
+| adddate(TIMESTAMP, BIGINT) |
+| bin(BIGINT) |
+| coalesce(BIGINT...) |
+...</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_functions_overview.xml#functions"/>, <xref href="impala_functions.xml#builtins"/>,
+ <xref href="impala_udf.xml#udfs"/>,
+ <xref href="impala_show.xml#show_databases"/>,
+ <xref href="impala_show.xml#show_tables"/>
+ </p>
+ </conbody>
+ </concept>
+
+ <concept rev="someday" audience="Cloudera" id="show_data_sources">
+
+ <title>SHOW DATA SOURCES Statement (CDH x.y and later only)</title>
+
+ <conbody>
+
+ <p>
+ <codeph>SHOW DATA SOURCES</codeph> lists the external data sources defined by the <codeph>CREATE DATA
+ SOURCE</codeph> statement. To show only those names matching a pattern, use the <codeph>LIKE</codeph>
+ clause with asterisks for wildcards, for example <codeph>SHOW DATA SOURCES LIKE '*sql*'</codeph>. These
+ data sources are global, not associated with a specific Impala database, so there is no <codeph>IN</codeph>
+ clause as in most other kinds of objects.
+ </p>
+
+<!--
+<p conref="../shared/impala_common.xml#common/example_blurb"/>
+<codeblock>To do: construct example for SHOW DATA SOURCES when that statement is externalized</codeblock>
+-->
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p></p>
+ </conbody>
+ </concept>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_smallint.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_smallint.xml b/docs/topics/impala_smallint.xml
new file mode 100644
index 0000000..3aae9ad
--- /dev/null
+++ b/docs/topics/impala_smallint.xml
@@ -0,0 +1,101 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="smallint">
+
+ <title>SMALLINT Data Type</title>
+ <titlealts><navtitle>SMALLINT</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ A 2-byte integer data type used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph> statements.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ In the column definition of a <codeph>CREATE TABLE</codeph> statement:
+ </p>
+
+<codeblock><varname>column_name</varname> SMALLINT</codeblock>
+
+ <p>
+ <b>Range:</b> -32768 .. 32767. There is no <codeph>UNSIGNED</codeph> subtype.
+ </p>
+
+ <p>
+ <b>Conversions:</b> Impala automatically converts to a larger integer type (<codeph>INT</codeph> or
+ <codeph>BIGINT</codeph>) or a floating-point type (<codeph>FLOAT</codeph> or <codeph>DOUBLE</codeph>).
+ Use <codeph>CAST()</codeph> to convert to <codeph>TINYINT</codeph>, <codeph>STRING</codeph>,
+ or <codeph>TIMESTAMP</codeph>.
+ <ph conref="../shared/impala_common.xml#common/cast_int_to_timestamp"/>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ For a convenient and automated way to check the bounds of the <codeph>SMALLINT</codeph> type, call the
+ functions <codeph>MIN_SMALLINT()</codeph> and <codeph>MAX_SMALLINT()</codeph>.
+ </p>
+
+ <p>
+ If an integer value is too large to be represented as a <codeph>SMALLINT</codeph>, use an
+ <codeph>INT</codeph> instead.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/null_bad_numeric_cast"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>CREATE TABLE t1 (x SMALLINT);
+SELECT CAST(1000 AS SMALLINT);
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/parquet_blurb"/>
+
+<!-- Duplicated under TINYINT and SMALLINT. Turn into a conref in both places. -->
+
+ <p rev="1.4.0">
+ Physically, Parquet files represent <codeph>TINYINT</codeph> and <codeph>SMALLINT</codeph> values as 32-bit
+ integers. Although Impala rejects attempts to insert out-of-range values into such columns, if you create a
+ new table with the <codeph>CREATE TABLE ... LIKE PARQUET</codeph> syntax, any <codeph>TINYINT</codeph> or
+ <codeph>SMALLINT</codeph> columns in the original table turn into <codeph>INT</codeph> columns in the new
+ table.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/partitioning_good"/>
+
+ <p conref="../shared/impala_common.xml#common/hbase_ok"/>
+
+ <p conref="../shared/impala_common.xml#common/text_bulky"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/internals_2_bytes"/>
+
+ <p conref="../shared/impala_common.xml#common/added_forever"/>
+
+ <p conref="../shared/impala_common.xml#common/column_stats_constant"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/restrictions_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_literals.xml#numeric_literals"/>, <xref href="impala_tinyint.xml#tinyint"/>,
+ <xref href="impala_smallint.xml#smallint"/>, <xref href="impala_int.xml#int"/>,
+ <xref href="impala_bigint.xml#bigint"/>, <xref href="impala_decimal.xml#decimal"/>,
+ <xref href="impala_math_functions.xml#math_functions"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_stddev.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_stddev.xml b/docs/topics/impala_stddev.xml
new file mode 100644
index 0000000..0cdff45
--- /dev/null
+++ b/docs/topics/impala_stddev.xml
@@ -0,0 +1,116 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.4" id="stddev">
+
+ <title>STDDEV, STDDEV_SAMP, STDDEV_POP Functions</title>
+ <titlealts><navtitle>STDDEV, STDDEV_SAMP, STDDEV_POP</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="Aggregate Functions"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">stddev() function</indexterm>
+ <indexterm audience="Cloudera">stddev_samp() function</indexterm>
+ <indexterm audience="Cloudera">stddev_pop() function</indexterm>
+ An aggregate function that returns the
+ <xref href="http://en.wikipedia.org/wiki/Standard_deviation" scope="external" format="html">standard
+ deviation</xref> of a set of numbers.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>{ STDDEV | STDDEV_SAMP | STDDEV_POP } ([DISTINCT | ALL] <varname>expression</varname>)</codeblock>
+
+ <p>
+ This function works with any numeric data type.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/former_odd_return_type_string"/>
+
+ <p>
+ This function is typically used in mathematical formulas related to probability distributions.
+ </p>
+
+ <p>
+ The <codeph>STDDEV_POP()</codeph> and <codeph>STDDEV_SAMP()</codeph> functions compute the population
+ standard deviation and sample standard deviation, respectively, of the input values.
+ (<codeph>STDDEV()</codeph> is an alias for <codeph>STDDEV_SAMP()</codeph>.) Both functions evaluate all input
+ rows matched by the query. The difference is that <codeph>STDDEV_SAMP()</codeph> is scaled by
+ <codeph>1/(N-1)</codeph> while <codeph>STDDEV_POP()</codeph> is scaled by <codeph>1/N</codeph>.
+ </p>
+
+ <p>
+ If no input rows match the query, the result of any of these functions is <codeph>NULL</codeph>. If a single
+ input row matches the query, the result of any of these functions is <codeph>"0.0"</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ This example demonstrates how <codeph>STDDEV()</codeph> and <codeph>STDDEV_SAMP()</codeph> return the same
+ result, while <codeph>STDDEV_POP()</codeph> uses a slightly different calculation to reflect that the input
+ data represents the entire <q>population</q> rather than a sample from a larger one.
+ </p>
+
+<codeblock>[localhost:21000] > select stddev(score) from test_scores;
++---------------+
+| stddev(score) |
++---------------+
+| 28.5 |
++---------------+
+[localhost:21000] > select stddev_samp(score) from test_scores;
++--------------------+
+| stddev_samp(score) |
++--------------------+
+| 28.5 |
++--------------------+
+[localhost:21000] > select stddev_pop(score) from test_scores;
++-------------------+
+| stddev_pop(score) |
++-------------------+
+| 28.4858 |
++-------------------+
+</codeblock>
+
+ <p>
+ This example demonstrates that, because the return value of these aggregate functions is a
+ <codeph>STRING</codeph>, you must currently convert the result with <codeph>CAST</codeph>.
+ </p>
+
+<codeblock>[localhost:21000] > create table score_stats as select cast(stddev(score) as decimal(7,4)) `standard_deviation`, cast(variance(score) as decimal(7,4)) `variance` from test_scores;
++-------------------+
+| summary |
++-------------------+
+| Inserted 1 row(s) |
++-------------------+
+[localhost:21000] > desc score_stats;
++--------------------+--------------+---------+
+| name | type | comment |
++--------------------+--------------+---------+
+| standard_deviation | decimal(7,4) | |
+| variance | decimal(7,4) | |
++--------------------+--------------+---------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/analytic_not_allowed_caveat"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ The <codeph>STDDEV()</codeph>, <codeph>STDDEV_POP()</codeph>, and <codeph>STDDEV_SAMP()</codeph> functions
+ compute the standard deviation (square root of the variance) based on the results of
+ <codeph>VARIANCE()</codeph>, <codeph>VARIANCE_POP()</codeph>, and <codeph>VARIANCE_SAMP()</codeph>
+ respectively. See <xref href="impala_variance.xml#variance"/> for details about the variance property.
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_string.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_string.xml b/docs/topics/impala_string.xml
new file mode 100644
index 0000000..9ad77c3
--- /dev/null
+++ b/docs/topics/impala_string.xml
@@ -0,0 +1,161 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="string">
+
+ <title>STRING Data Type</title>
+ <titlealts><navtitle>STRING</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ A data type used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph> statements.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ In the column definition of a <codeph>CREATE TABLE</codeph> statement:
+ </p>
+
+<codeblock><varname>column_name</varname> STRING</codeblock>
+
+ <p>
+ <b>Length:</b> Maximum of 32,767 bytes. Do not use any length constraint when declaring
+ <codeph>STRING</codeph> columns, as you might be familiar with from <codeph>VARCHAR</codeph>,
+ <codeph>CHAR</codeph>, or similar column types from relational database systems. <ph rev="2.0.0">If you do
+ need to manipulate string values with precise or maximum lengths, in Impala 2.0 and higher you can declare
+ columns as <codeph>VARCHAR(<varname>max_length</varname>)</codeph> or
+ <codeph>CHAR(<varname>length</varname>)</codeph>, but for best performance use <codeph>STRING</codeph>
+ where practical.</ph>
+ </p>
+
+ <p>
+ <b>Character sets:</b> For full support in all Impala subsystems, restrict string values to the ASCII
+ character set. UTF-8 character data can be stored in Impala and retrieved through queries, but UTF-8 strings
+ containing non-ASCII characters are not guaranteed to work properly with string manipulation functions,
+ comparison operators, or the <codeph>ORDER BY</codeph> clause. For any national language aspects such as
+ collation order or interpreting extended ASCII variants such as ISO-8859-1 or ISO-8859-2 encodings, Impala
+ does not include such metadata with the table definition. If you need to sort, manipulate, or display data
+ depending on those national language characteristics of string data, use logic on the application side.
+ </p>
+
+ <p>
+ <b>Conversions:</b>
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ Impala does not automatically convert <codeph>STRING</codeph> to any numeric type. Impala does
+ automatically convert <codeph>STRING</codeph> to <codeph>TIMESTAMP</codeph> if the value matches one of
+ the accepted <codeph>TIMESTAMP</codeph> formats; see <xref href="impala_timestamp.xml#timestamp"/> for
+ details.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ You can use <codeph>CAST()</codeph> to convert <codeph>STRING</codeph> values to
+ <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>, <codeph>INT</codeph>, <codeph>BIGINT</codeph>,
+ <codeph>FLOAT</codeph>, <codeph>DOUBLE</codeph>, or <codeph>TIMESTAMP</codeph>.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ You cannot directly cast a <codeph>STRING</codeph> value to <codeph>BOOLEAN</codeph>. You can use a
+ <codeph>CASE</codeph> expression to evaluate string values such as <codeph>'T'</codeph>,
+ <codeph>'true'</codeph>, and so on and return Boolean <codeph>true</codeph> and <codeph>false</codeph>
+ values as appropriate.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ You can cast a <codeph>BOOLEAN</codeph> value to <codeph>STRING</codeph>, returning <codeph>'1'</codeph>
+ for <codeph>true</codeph> values and <codeph>'0'</codeph> for <codeph>false</codeph> values.
+ </p>
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/partitioning_blurb"/>
+
+ <p>
+ Although it might be convenient to use <codeph>STRING</codeph> columns for partition keys, even when those
+ columns contain numbers, for performance and scalability it is much better to use numeric columns as
+ partition keys whenever practical. Although the underlying HDFS directory name might be the same in either
+ case, the in-memory storage for the partition key columns is more compact, and computations are faster, if
+ partition key columns such as <codeph>YEAR</codeph>, <codeph>MONTH</codeph>, <codeph>DAY</codeph> and so on
+ are declared as <codeph>INT</codeph>, <codeph>SMALLINT</codeph>, and so on.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/zero_length_strings"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/hbase_blurb"/> -->
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/parquet_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/text_bulky"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> -->
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/internals_blurb"/> -->
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/added_in_20"/> -->
+
+ <p conref="../shared/impala_common.xml#common/column_stats_variable"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/restrictions_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following examples demonstrate double-quoted and single-quoted string literals, and required escaping for
+ quotation marks within string literals:
+ </p>
+
+<codeblock>SELECT 'I am a single-quoted string';
+SELECT "I am a double-quoted string";
+SELECT 'I\'m a single-quoted string with an apostrophe';
+SELECT "I\'m a double-quoted string with an apostrophe";
+SELECT 'I am a "short" single-quoted string containing quotes';
+SELECT "I am a \"short\" double-quoted string containing quotes";
+</codeblock>
+
+ <p>
+ The following examples demonstrate calls to string manipulation functions to concatenate strings, convert
+ numbers to strings, or pull out substrings:
+ </p>
+
+<codeblock>SELECT CONCAT("Once upon a time, there were ", CAST(3 AS STRING), ' little pigs.');
+SELECT SUBSTR("hello world",7,5);
+</codeblock>
+
+ <p>
+ The following examples show how to perform operations on <codeph>STRING</codeph> columns within a table:
+ </p>
+
+<codeblock>CREATE TABLE t1 (s1 STRING, s2 STRING);
+INSERT INTO t1 VALUES ("hello", 'world'), (CAST(7 AS STRING), "wonders");
+SELECT s1, s2, length(s1) FROM t1 WHERE s2 LIKE 'w%';
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_literals.xml#string_literals"/>, <xref href="impala_char.xml#char"/>,
+ <xref href="impala_varchar.xml#varchar"/>, <xref href="impala_string_functions.xml#string_functions"/>,
+ <xref href="impala_datetime_functions.xml#datetime_functions"/>
+ </p>
+ </conbody>
+</concept>
[11/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_functions.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_functions.xml b/docs/topics/impala_functions.xml
new file mode 100644
index 0000000..527744b
--- /dev/null
+++ b/docs/topics/impala_functions.xml
@@ -0,0 +1,162 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="builtins">
+
+ <title id="title_functions">Impala Built-In Functions</title>
+ <titlealts><navtitle>Built-In Functions</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <draft-comment translate="no">
+Opportunity to conref some material between here and the "Functions" topic under "Schema Objects".
+</draft-comment>
+
+ <p>
+ Impala supports several categories of built-in functions. These functions let you perform mathematical
+ calculations, string manipulation, date calculations, and other kinds of data transformations directly in
+ <codeph>SELECT</codeph> statements. The built-in functions let a SQL query return results with all
+ formatting, calculating, and type conversions applied, rather than performing time-consuming postprocessing
+ in another application. By applying function calls where practical, you can make a SQL query that is as
+ convenient as an expression in a procedural programming language or a formula in a spreadsheet.
+ </p>
+
+ <p>
+ The categories of functions supported by Impala are:
+ </p>
+
+ <ul>
+ <li>
+ <xref href="impala_math_functions.xml#math_functions"/>
+ </li>
+
+ <li>
+ <xref href="impala_conversion_functions.xml#conversion_functions"/>
+ </li>
+
+ <li>
+ <xref href="impala_datetime_functions.xml#datetime_functions"/>
+ </li>
+
+ <li>
+ <xref href="impala_conditional_functions.xml#conditional_functions"/>
+ </li>
+
+ <li>
+ <xref href="impala_string_functions.xml#string_functions"/>
+ </li>
+
+ <li>
+ Aggregation functions, explained in <xref href="impala_aggregate_functions.xml#aggregate_functions"/>.
+ </li>
+ </ul>
+
+ <p>
+ You call any of these functions through the <codeph>SELECT</codeph> statement. For most functions, you can
+ omit the <codeph>FROM</codeph> clause and supply literal values for any required arguments:
+ </p>
+
+<codeblock>select abs(-1);
++---------+
+| abs(-1) |
++---------+
+| 1 |
++---------+
+
+select concat('The rain ', 'in Spain');
++---------------------------------+
+| concat('the rain ', 'in spain') |
++---------------------------------+
+| The rain in Spain |
++---------------------------------+
+
+select power(2,5);
++-------------+
+| power(2, 5) |
++-------------+
+| 32 |
++-------------+
+</codeblock>
+
+ <p>
+ When you use a <codeph>FROM</codeph> clause and specify a column name as a function argument, the function is
+ applied for each item in the result set:
+ </p>
+
+<!-- TK: make real output for these; change the queries if necessary to use tables I already have. -->
+
+<codeblock>select concat('Country = ',country_code) from all_countries where population > 100000000;
+select round(price) as dollar_value from product_catalog where price between 0.0 and 100.0;
+</codeblock>
+
+ <p>
+ Typically, if any argument to a built-in function is <codeph>NULL</codeph>, the result value is also
+ <codeph>NULL</codeph>:
+ </p>
+
+<codeblock>select cos(null);
++-----------+
+| cos(null) |
++-----------+
+| NULL |
++-----------+
+
+select power(2,null);
++----------------+
+| power(2, null) |
++----------------+
+| NULL |
++----------------+
+
+select concat('a',null,'b');
++------------------------+
+| concat('a', null, 'b') |
++------------------------+
+| NULL |
++------------------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/aggr1"/>
+
+<codeblock conref="../shared/impala_common.xml#common/aggr2"/>
+
+ <p conref="../shared/impala_common.xml#common/aggr3"/>
+
+ <p>
+ Aggregate functions are a special category with different rules. These functions calculate a return value
+ across all the items in a result set, so they do require a <codeph>FROM</codeph> clause in the query:
+ </p>
+
+<!-- TK: make real output for these; change the queries if necessary to use tables I already have. -->
+
+<codeblock>select count(product_id) from product_catalog;
+select max(height), avg(height) from census_data where age > 20;
+</codeblock>
+
+ <p>
+ Aggregate functions also ignore <codeph>NULL</codeph> values rather than returning a <codeph>NULL</codeph>
+ result. For example, if some rows have <codeph>NULL</codeph> for a particular column, those rows are ignored
+ when computing the <codeph>AVG()</codeph> for that column. Likewise, specifying <codeph>COUNT(col_name)</codeph> in a query
+ counts only those rows where <codeph>col_name</codeph> contains a non-<codeph>NULL</codeph> value.
+ </p>
+
+ <p rev="2.0.0">
+ Analytic functions are a variation on aggregate functions. Instead of returning a single value, or an
+ identical value for each group of rows, they can compute values that vary based on a <q>window</q> consisting
+ of other rows around them in the result set.
+ </p>
+
+ <p outputclass="toc"/>
+
+ </conbody>
+
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_functions_overview.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_functions_overview.xml b/docs/topics/impala_functions_overview.xml
new file mode 100644
index 0000000..26a4d35
--- /dev/null
+++ b/docs/topics/impala_functions_overview.xml
@@ -0,0 +1,116 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="functions">
+
+ <title>Overview of Impala Functions</title>
+ <titlealts><navtitle>Functions</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Functions let you apply arithmetic, string, or other computations and transformations to Impala data. You
+ typically use them in <codeph>SELECT</codeph> lists and <codeph>WHERE</codeph> clauses to filter and format
+ query results so that the result set is exactly what you want, with no further processing needed on the
+ application side.
+ </p>
+
+ <p>
+ Scalar functions return a single result for each input row. See <xref href="impala_functions.xml#builtins"/>.
+ </p>
+
+<codeblock>[localhost:21000] > select name, population from country where continent = 'North America' order by population desc limit 4;
+[localhost:21000] > select upper(name), population from country where continent = 'North America' order by population desc limit 4;
++-------------+------------+
+| upper(name) | population |
++-------------+------------+
+| USA | 320000000 |
+| MEXICO | 122000000 |
+| CANADA | 25000000 |
+| GUATEMALA | 16000000 |
++-------------+------------+
+</codeblock>
+ <p>
+ Aggregate functions combine the results from multiple rows:
+ either a single result for the entire table, or a separate result for each group of rows.
+ Aggregate functions are frequently used in combination with <codeph>GROUP BY</codeph>
+ and <codeph>HAVING</codeph> clauses in the <codeph>SELECT</codeph> statement.
+ See <xref href="impala_aggregate_functions.xml#aggregate_functions"/>.
+ </p>
+
+<codeblock>[localhost:21000] > select continent, <b>sum(population)</b> as howmany from country <b>group by continent</b> order by howmany desc;
++---------------+------------+
+| continent | howmany |
++---------------+------------+
+| Asia | 4298723000 |
+| Africa | 1110635000 |
+| Europe | 742452000 |
+| North America | 565265000 |
+| South America | 406740000 |
+| Oceania | 38304000 |
++---------------+------------+
+</codeblock>
+
+ <p>
+ User-defined functions (UDFs) let you code your own logic. They can be either scalar or aggregate functions.
+ UDFs let you implement important business or scientific logic using high-performance code for Impala to automatically parallelize.
+ You can also use UDFs to implement convenience functions to simplify reporting or porting SQL from other database systems.
+ See <xref href="impala_udf.xml#udfs"/>.
+ </p>
+
+<codeblock>[localhost:21000] > select <b>rot13('Hello world!')</b> as 'Weak obfuscation';
++------------------+
+| weak obfuscation |
++------------------+
+| Uryyb jbeyq! |
++------------------+
+[localhost:21000] > select <b>likelihood_of_new_subatomic_particle(sensor1, sensor2, sensor3)</b> as probability
+ > from experimental_results group by experiment;
+</codeblock>
+
+ <p>
+ Each function is associated with a specific database. For example, if you issue a <codeph>USE somedb</codeph>
+ statement followed by <codeph>CREATE FUNCTION somefunc</codeph>, the new function is created in the
+ <codeph>somedb</codeph> database, and you could refer to it through the fully qualified name
+ <codeph>somedb.somefunc</codeph>. You could then issue another <codeph>USE</codeph> statement
+ and create a function with the same name in a different database.
+ </p>
+
+ <p>
+ Impala built-in functions are associated with a special database named <codeph>_impala_builtins</codeph>,
+ which lets you refer to them from any database without qualifying the name.
+ </p>
+
+<codeblock>[localhost:21000] > show databases;
++-------------------------+
+| name |
++-------------------------+
+| <b>_impala_builtins</b> |
+| analytic_functions |
+| avro_testing |
+| data_file_size |
+...
+[localhost:21000] > show functions in _impala_builtins like '*subs*';
++-------------+-----------------------------------+
+| return type | signature |
++-------------+-----------------------------------+
+| STRING | substr(STRING, BIGINT) |
+| STRING | substr(STRING, BIGINT, BIGINT) |
+| STRING | substring(STRING, BIGINT) |
+| STRING | substring(STRING, BIGINT, BIGINT) |
++-------------+-----------------------------------+
+</codeblock>
+
+ <p>
+ <b>Related statements:</b> <xref href="impala_create_function.xml#create_function"/>,
+ <xref href="impala_drop_function.xml#drop_function"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_grant.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_grant.xml b/docs/topics/impala_grant.xml
new file mode 100644
index 0000000..6aad41e
--- /dev/null
+++ b/docs/topics/impala_grant.xml
@@ -0,0 +1,117 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.0.0" id="grant">
+
+ <title>GRANT Statement (CDH 5.2 or higher only)</title>
+ <titlealts><navtitle>GRANT (CDH 5.2 or higher only)</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Sentry"/>
+ <data name="Category" value="Roles"/>
+ <!-- Consider whether to go deeper into categories like Security for the Sentry-related statements. -->
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">GRANT statement</indexterm>
+<!-- Copied from Sentry docs. Turn into conref. I did some rewording for clarity. -->
+ The <codeph>GRANT</codeph> statement grants roles or privileges on specified objects to groups. Only Sentry
+ administrative users can grant roles to a group.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock rev="2.3.0 collevelauth">GRANT ROLE <varname>role_name</varname> TO GROUP <varname>group_name</varname>
+
+GRANT <varname>privilege</varname> ON <varname>object_type</varname> <varname>object_name</varname>
+ TO [ROLE] <varname>role_name</varname>
+ [WITH GRANT OPTION]
+
+<ph rev="2.3.0">privilege ::= SELECT | SELECT(<varname>column_name</varname>) | INSERT | ALL</ph>
+object_type ::= TABLE | DATABASE | SERVER | URI
+</codeblock>
+
+ <p>
+ Typically, the object name is an identifier. For URIs, it is a string literal.
+ </p>
+
+<!-- Turn privilege info into a conref or series of conrefs. (In both GRANT and REVOKE.) -->
+
+ <p conref="../shared/impala_common.xml#common/privileges_blurb"/>
+
+ <p>
+<!-- To do: The wording here can be fluid, and it's reused in several statements. Turn into a conref. -->
+ Only administrative users (initially, a predefined set of users specified in the Sentry service configuration
+ file) can use this statement.
+ </p>
+
+ <p>
+ The <codeph>WITH GRANT OPTION</codeph> clause allows members of the specified role to issue
+ <codeph>GRANT</codeph> and <codeph>REVOKE</codeph> statements for those same privileges.
+<!-- Copied from Sentry docs. Turn into conref. I did some rewording for clarity. -->
+ Hence, if a role has the <codeph>ALL</codeph> privilege on a database and the <codeph>WITH GRANT
+ OPTION</codeph> set, users granted that role can execute <codeph>GRANT</codeph>/<codeph>REVOKE</codeph>
+ statements only for that database or child tables of the database. This means a user could revoke the
+ privileges of the user that provided them the <codeph>GRANT OPTION</codeph>.
+ </p>
+
+ <p>
+<!-- Copied from Sentry docs. Turn into conref. Except I changed Hive to Impala. -->
+ Impala does not currently support revoking only the <codeph>WITH GRANT OPTION</codeph> from a privilege
+ previously granted to a role. To remove the <codeph>WITH GRANT OPTION</codeph>, revoke the privilege and
+ grant it again without the <codeph>WITH GRANT OPTION</codeph> flag.
+ </p>
+
+ <p rev="2.3.0 collevelauth">
+ The ability to grant or revoke <codeph>SELECT</codeph> privilege on specific columns is available
+ in CDH 5.5 / Impala 2.3 and higher. See <xref href="sg_hive_sql.xml#concept_c2q_4qx_p4/col_level_auth_sentry"/>
+ for details.
+ </p>
+
+<!-- Turn compatibility info into a conref or series of conrefs. (In both GRANT and REVOKE.) -->
+
+<!-- If they diverge during development, consider the version here in GRANT the authoritative one. -->
+
+ <p conref="../shared/impala_common.xml#common/compatibility_blurb"/>
+
+ <p>
+ <ul>
+ <li>
+ The Impala <codeph>GRANT</codeph> and <codeph>REVOKE</codeph> statements are available in CDH 5.2 and
+ later.
+ </li>
+
+ <li>
+ In CDH 5.1 and later, Impala can make use of any roles and privileges specified by the
+ <codeph>GRANT</codeph> and <codeph>REVOKE</codeph> statements in Hive, when your system is configured to
+ use the Sentry service instead of the file-based policy mechanism.
+ </li>
+
+ <li>
+ The Impala <codeph>GRANT</codeph> and <codeph>REVOKE</codeph> statements for privileges do not require
+ the <codeph>ROLE</codeph> keyword to be repeated before each role name, unlike the equivalent Hive
+ statements.
+ </li>
+
+ <li conref="../shared/impala_common.xml#common/grant_revoke_single"/>
+ </ul>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_authorization.xml#authorization"/>, <xref href="impala_revoke.xml#revoke"/>,
+ <xref href="impala_create_role.xml#create_role"/>, <xref href="impala_drop_role.xml#drop_role"/>,
+ <xref href="impala_show.xml#show"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_group_by.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_group_by.xml b/docs/topics/impala_group_by.xml
new file mode 100644
index 0000000..10b7de4
--- /dev/null
+++ b/docs/topics/impala_group_by.xml
@@ -0,0 +1,137 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="group_by">
+
+ <title>GROUP BY Clause</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Aggregate Functions"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Specify the <codeph>GROUP BY</codeph> clause in queries that use aggregation functions, such as
+ <codeph><xref href="impala_count.xml#count">COUNT()</xref></codeph>,
+ <codeph><xref href="impala_sum.xml#sum">SUM()</xref></codeph>,
+ <codeph><xref href="impala_avg.xml#avg">AVG()</xref></codeph>,
+ <codeph><xref href="impala_min.xml#min">MIN()</xref></codeph>, and
+ <codeph><xref href="impala_max.xml#max">MAX()</xref></codeph>. Specify in the
+ <codeph><xref href="impala_group_by.xml#group_by">GROUP BY</xref></codeph> clause the names of all the
+ columns that do not participate in the aggregation operation.
+ </p>
+
+ <!-- Good to show an example of cases where ORDER BY does and doesn't work with complex types. -->
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p rev="2.3.0">
+ In CDH 5.5 / Impala 2.3 and higher, the complex data types <codeph>STRUCT</codeph>,
+ <codeph>ARRAY</codeph>, and <codeph>MAP</codeph> are available. These columns cannot
+ be referenced directly in the <codeph>GROUP BY</codeph> clause.
+ When you query a complex type column, you use join notation to <q>unpack</q> the elements
+ of the complex type, and within the join query you can include an <codeph>ORDER BY</codeph>
+ clause to control the order in the result set of the scalar elements from the complex type.
+ See <xref href="impala_complex_types.xml#complex_types"/> for details about Impala support for complex types.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/zero_length_strings"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ For example, the following query finds the 5 items that sold the highest total quantity (using the
+ <codeph>SUM()</codeph> function), and also counts the number of sales transactions for those items (using the
+ <codeph>COUNT()</codeph> function). Because the column representing the item IDs is not used in any
+ aggregation functions, we specify that column in the <codeph>GROUP BY</codeph> clause.
+ </p>
+
+<codeblock>select
+ <b>ss_item_sk</b> as Item,
+ <b>count</b>(ss_item_sk) as Times_Purchased,
+ <b>sum</b>(ss_quantity) as Total_Quantity_Purchased
+from store_sales
+ <b>group by ss_item_sk</b>
+ order by sum(ss_quantity) desc
+ limit 5;
++-------+-----------------+--------------------------+
+| item | times_purchased | total_quantity_purchased |
++-------+-----------------+--------------------------+
+| 9325 | 372 | 19072 |
+| 4279 | 357 | 18501 |
+| 7507 | 371 | 18475 |
+| 5953 | 369 | 18451 |
+| 16753 | 375 | 18446 |
++-------+-----------------+--------------------------+</codeblock>
+
+ <p>
+ The <codeph>HAVING</codeph> clause lets you filter the results of aggregate functions, because you cannot
+ refer to those expressions in the <codeph>WHERE</codeph> clause. For example, to find the 5 lowest-selling
+ items that were included in at least 100 sales transactions, we could use this query:
+ </p>
+
+<codeblock>select
+ <b>ss_item_sk</b> as Item,
+ <b>count</b>(ss_item_sk) as Times_Purchased,
+ <b>sum</b>(ss_quantity) as Total_Quantity_Purchased
+from store_sales
+ <b>group by ss_item_sk</b>
+ <b>having times_purchased >= 100</b>
+ order by sum(ss_quantity)
+ limit 5;
++-------+-----------------+--------------------------+
+| item | times_purchased | total_quantity_purchased |
++-------+-----------------+--------------------------+
+| 13943 | 105 | 4087 |
+| 2992 | 101 | 4176 |
+| 4773 | 107 | 4204 |
+| 14350 | 103 | 4260 |
+| 11956 | 102 | 4275 |
++-------+-----------------+--------------------------+</codeblock>
+
+ <p>
+ When performing calculations involving scientific or financial data, remember that columns with type
+ <codeph>FLOAT</codeph> or <codeph>DOUBLE</codeph> are stored as true floating-point numbers, which cannot
+ precisely represent every possible fractional value. Thus, if you include a <codeph>FLOAT</codeph> or
+ <codeph>DOUBLE</codeph> column in a <codeph>GROUP BY</codeph> clause, the results might not precisely match
+ literal values in your query or from an original Text data file. Use rounding operations, the
+ <codeph>BETWEEN</codeph> operator, or another arithmetic technique to match floating-point values that are
+ <q>near</q> literal values you expect. For example, this query on the <codeph>ss_wholesale_cost</codeph>
+ column returns cost values that are close but not identical to the original figures that were entered as
+ decimal fractions.
+ </p>
+
+<codeblock>select ss_wholesale_cost, avg(ss_quantity * ss_sales_price) as avg_revenue_per_sale
+ from store_sales
+ group by ss_wholesale_cost
+ order by avg_revenue_per_sale desc
+ limit 5;
++-------------------+----------------------+
+| ss_wholesale_cost | avg_revenue_per_sale |
++-------------------+----------------------+
+| 96.94000244140625 | 4454.351539300434 |
+| 95.93000030517578 | 4423.119941283189 |
+| 98.37999725341797 | 4332.516490316291 |
+| 97.97000122070312 | 4330.480601655014 |
+| 98.52999877929688 | 4291.316953108634 |
++-------------------+----------------------+</codeblock>
+
+ <p>
+ Notice how wholesale cost values originally entered as decimal fractions such as <codeph>96.94</codeph> and
+ <codeph>98.38</codeph> are slightly larger or smaller in the result set, due to precision limitations in the
+ hardware floating-point types. The imprecise representation of <codeph>FLOAT</codeph> and
+ <codeph>DOUBLE</codeph> values is why financial data processing systems often store currency using data types
+ that are less space-efficient but avoid these types of rounding errors.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref href="impala_select.xml#select"/>,
+ <xref href="impala_aggregate_functions.xml#aggregate_functions"/>
+ </p>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_group_concat.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_group_concat.xml b/docs/topics/impala_group_concat.xml
new file mode 100644
index 0000000..b2a7ff6
--- /dev/null
+++ b/docs/topics/impala_group_concat.xml
@@ -0,0 +1,133 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.2" id="group_concat">
+
+ <title>GROUP_CONCAT Function</title>
+ <titlealts><navtitle>GROUP_CONCAT</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="Aggregate Functions"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">group_concat() function</indexterm>
+ An aggregate function that returns a single string representing the argument values concatenated together
+ from each row of the result set. If the optional separator string is specified, the separator is added between
+ each pair of concatenated values. The default separator is a comma followed by a space.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<!-- Might allow DISTINCT at some point. Check: does it allow ALL now? -->
+
+<codeblock>GROUP_CONCAT([ALL] <varname>expression</varname> [, <varname>separator</varname>])</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/concat_blurb"/>
+
+ <p>
+ By default, returns a single string covering the whole result set. To include other columns or values in the
+ result set, or to produce multiple concatenated strings for subsets of rows, include a <codeph>GROUP
+ BY</codeph> clause in the query.
+ </p>
+
+ <p>
+ <b>Return type:</b> <codeph>STRING</codeph>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p>
+ You cannot apply the <codeph>DISTINCT</codeph> operator to the argument of this function.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/analytic_not_allowed_caveat"/>
+
+ <p>
+ Currently, Impala returns an error if the result value grows larger than 1 GiB.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following examples illustrate various aspects of the <codeph>GROUP_CONCAT()</codeph> function.
+ </p>
+
+ <p>
+ You can call the function directly on a <codeph>STRING</codeph> column. To use it with a numeric column, cast
+ the value to <codeph>STRING</codeph>.
+ </p>
+
+<codeblock>[localhost:21000] > create table t1 (x int, s string);
+[localhost:21000] > insert into t1 values (1, "one"), (3, "three"), (2, "two"), (1, "one");
+[localhost:21000] > select group_concat(s) from t1;
++----------------------+
+| group_concat(s) |
++----------------------+
+| one, three, two, one |
++----------------------+
+[localhost:21000] > select group_concat(cast(x as string)) from t1;
++---------------------------------+
+| group_concat(cast(x as string)) |
++---------------------------------+
+| 1, 3, 2, 1 |
++---------------------------------+
+</codeblock>
+
+ <p>
+ The optional separator lets you format the result in flexible ways. The separator can be an arbitrary string
+ expression, not just a single character.
+ </p>
+
+<codeblock>[localhost:21000] > select group_concat(s,"|") from t1;
++----------------------+
+| group_concat(s, '|') |
++----------------------+
+| one|three|two|one |
++----------------------+
+[localhost:21000] > select group_concat(s,'---') from t1;
++-------------------------+
+| group_concat(s, '---') |
++-------------------------+
+| one---three---two---one |
++-------------------------+
+</codeblock>
+
+ <p>
+ The default separator is a comma followed by a space. To get a comma-delimited result without extra spaces,
+ specify a delimiter character that is only a comma.
+ </p>
+
+<codeblock>[localhost:21000] > select group_concat(s,',') from t1;
++----------------------+
+| group_concat(s, ',') |
++----------------------+
+| one,three,two,one |
++----------------------+
+</codeblock>
+
+ <p>
+ Including a <codeph>GROUP BY</codeph> clause lets you produce a different concatenated result for each group
+ in the result set. In this example, the only <codeph>X</codeph> value that occurs more than once is
+ <codeph>1</codeph>, so that is the only row in the result set where <codeph>GROUP_CONCAT()</codeph> returns a
+ delimited value. For groups containing a single value, <codeph>GROUP_CONCAT()</codeph> returns the original
+ value of its <codeph>STRING</codeph> argument.
+ </p>
+
+<codeblock>[localhost:21000] > select x, group_concat(s) from t1 group by x;
++---+-----------------+
+| x | group_concat(s) |
++---+-----------------+
+| 2 | two |
+| 3 | three |
+| 1 | one, one |
++---+-----------------+
+</codeblock>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_having.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_having.xml b/docs/topics/impala_having.xml
new file mode 100644
index 0000000..064a4a8
--- /dev/null
+++ b/docs/topics/impala_having.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="having">
+
+ <title>HAVING Clause</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Aggregate Functions"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Performs a filter operation on a <codeph>SELECT</codeph> query, by examining the results of aggregation
+ functions rather than testing each individual table row. Therefore, it is always used in conjunction with a
+ function such as <codeph><xref href="impala_count.xml#count">COUNT()</xref></codeph>,
+ <codeph><xref href="impala_sum.xml#sum">SUM()</xref></codeph>,
+ <codeph><xref href="impala_avg.xml#avg">AVG()</xref></codeph>,
+ <codeph><xref href="impala_min.xml#min">MIN()</xref></codeph>, or
+ <codeph><xref href="impala_max.xml#max">MAX()</xref></codeph>, and typically with the
+ <codeph><xref href="impala_group_by.xml#group_by">GROUP BY</xref></codeph> clause also.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p rev="2.0.0">
+ The filter expression in the <codeph>HAVING</codeph> clause cannot include a scalar subquery.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref href="impala_select.xml#select"/>,
+ <xref href="impala_group_by.xml#group_by"/>,
+ <xref href="impala_aggregate_functions.xml#aggregate_functions"/>
+ </p>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_hbase_cache_blocks.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_hbase_cache_blocks.xml b/docs/topics/impala_hbase_cache_blocks.xml
new file mode 100644
index 0000000..d42cbf6
--- /dev/null
+++ b/docs/topics/impala_hbase_cache_blocks.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="hbase_cache_blocks">
+
+ <title>HBASE_CACHE_BLOCKS Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="HBase"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">HBASE_CACHE_BLOCKS query option</indexterm>
+ Setting this option is equivalent to calling the <codeph>setCacheBlocks</codeph> method of the class
+ <xref href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/client/Scan.html" scope="external" format="html">org.apache.hadoop.hbase.client.Scan</xref>,
+ in an HBase Java application. Helps to control the memory pressure on the HBase region server, in conjunction
+ with the <codeph>HBASE_CACHING</codeph> query option.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/type_boolean"/>
+ <p conref="../shared/impala_common.xml#common/default_false_0"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref href="impala_hbase.xml#impala_hbase"/>,
+ <xref href="impala_hbase_caching.xml#hbase_caching"/>
+ </p>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_hbase_caching.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_hbase_caching.xml b/docs/topics/impala_hbase_caching.xml
new file mode 100644
index 0000000..e543792
--- /dev/null
+++ b/docs/topics/impala_hbase_caching.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="hbase_caching">
+
+ <title>HBASE_CACHING Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="HBase"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">HBASE_CACHING query option</indexterm>
+ Setting this option is equivalent to calling the <codeph>setCaching</codeph> method of the class
+ <xref href="http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/client/Scan.html" scope="external" format="html">org.apache.hadoop.hbase.client.Scan</xref>,
+ in an HBase Java application. Helps to control the memory pressure on the HBase region server, in conjunction
+ with the <codeph>HBASE_CACHE_BLOCKS</codeph> query option.
+ </p>
+
+ <p>
+ <b>Type:</b> <codeph>INT</codeph>
+ </p>
+
+ <p>
+ <b>Default:</b> 0
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref href="impala_hbase.xml#impala_hbase"/>,
+ <xref href="impala_hbase_cache_blocks.xml#hbase_cache_blocks"/>
+ </p>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_hints.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_hints.xml b/docs/topics/impala_hints.xml
new file mode 100644
index 0000000..429fb19
--- /dev/null
+++ b/docs/topics/impala_hints.xml
@@ -0,0 +1,247 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="hints">
+
+ <title>Query Hints in Impala SELECT Statements</title>
+ <titlealts><navtitle>Hints</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Performance"/>
+ <data name="Category" value="Troubleshooting"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">hints</indexterm>
+ The Impala SQL dialect supports query hints, for fine-tuning the inner workings of queries. Specify hints as
+ a temporary workaround for expensive queries, where missing statistics or other factors cause inefficient
+ performance.
+ </p>
+
+ <p>
+ Hints are most often used for the most resource-intensive kinds of Impala queries:
+ </p>
+
+ <ul>
+ <li>
+ Join queries involving large tables, where intermediate result sets are transmitted across the network to
+ evaluate the join conditions.
+ </li>
+
+ <li>
+ Inserting into partitioned Parquet tables, where many memory buffers could be allocated on each host to
+ hold intermediate results for each partition.
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ You can represent the hints as keywords surrounded by <codeph>[]</codeph> square brackets; include the
+ brackets in the text of the SQL statement.
+ </p>
+
+<codeblock>SELECT STRAIGHT_JOIN <varname>select_list</varname> FROM
+<varname>join_left_hand_table</varname>
+ JOIN [{BROADCAST|SHUFFLE}]
+<varname>join_right_hand_table</varname>
+<varname>remainder_of_query</varname>;
+
+INSERT <varname>insert_clauses</varname>
+ [{SHUFFLE|NOSHUFFLE}]
+ SELECT <varname>remainder_of_query</varname>;
+</codeblock>
+
+ <p rev="2.0.0">
+ In Impala 2.0 and higher, or CDH 5.2 and higher, you can also specify the hints inside comments that use
+ either the <codeph>/* */</codeph> or <codeph>--</codeph> notation. Specify a <codeph>+</codeph> symbol
+ immediately before the hint name.
+ </p>
+
+<codeblock rev="2.0.0">SELECT STRAIGHT_JOIN <varname>select_list</varname> FROM
+<varname>join_left_hand_table</varname>
+ JOIN /* +BROADCAST|SHUFFLE */
+<varname>join_right_hand_table</varname>
+<varname>remainder_of_query</varname>;
+
+SELECT <varname>select_list</varname> FROM
+<varname>join_left_hand_table</varname>
+ JOIN -- +BROADCAST|SHUFFLE
+<varname>join_right_hand_table</varname>
+<varname>remainder_of_query</varname>;
+
+INSERT <varname>insert_clauses</varname>
+ /* +SHUFFLE|NOSHUFFLE */
+ SELECT <varname>remainder_of_query</varname>;
+
+INSERT <varname>insert_clauses</varname>
+ -- +SHUFFLE|NOSHUFFLE
+ SELECT <varname>remainder_of_query</varname>;
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ With both forms of hint syntax, include the <codeph>STRAIGHT_JOIN</codeph>
+ keyword immediately after the <codeph>SELECT</codeph> keyword to prevent Impala from
+ reordering the tables in a way that makes the hint ineffective.
+ </p>
+
+ <p>
+ To reduce the need to use hints, run the <codeph>COMPUTE STATS</codeph> statement against all tables involved
+ in joins, or used as the source tables for <codeph>INSERT ... SELECT</codeph> operations where the
+ destination is a partitioned Parquet table. Do this operation after loading data or making substantial
+ changes to the data within each table. Having up-to-date statistics helps Impala choose more efficient query
+ plans without the need for hinting. See <xref href="impala_perf_stats.xml#perf_stats"/> for details and
+ examples.
+ </p>
+
+ <p>
+ To see which join strategy is used for a particular query, examine the <codeph>EXPLAIN</codeph> output for
+ that query. See <xref href="impala_explain_plan.xml#perf_explain"/> for details and examples.
+ </p>
+
+ <p>
+ <b>Hints for join queries:</b>
+ </p>
+
+ <p>
+ The <codeph>[BROADCAST]</codeph> and <codeph>[SHUFFLE]</codeph> hints control the execution strategy for join
+ queries. Specify one of the following constructs immediately after the <codeph>JOIN</codeph> keyword in a
+ query:
+ </p>
+
+ <ul>
+ <li>
+ <codeph>[SHUFFLE]</codeph> - Makes that join operation use the <q>partitioned</q> technique, which divides
+ up corresponding rows from both tables using a hashing algorithm, sending subsets of the rows to other
+ nodes for processing. (The keyword <codeph>SHUFFLE</codeph> is used to indicate a <q>partitioned join</q>,
+ because that type of join is not related to <q>partitioned tables</q>.) Since the alternative
+ <q>broadcast</q> join mechanism is the default when table and index statistics are unavailable, you might
+ use this hint for queries where broadcast joins are unsuitable; typically, partitioned joins are more
+ efficient for joins between large tables of similar size.
+ </li>
+
+ <li>
+ <codeph>[BROADCAST]</codeph> - Makes that join operation use the <q>broadcast</q> technique that sends the
+ entire contents of the right-hand table to all nodes involved in processing the join. This is the default
+ mode of operation when table and index statistics are unavailable, so you would typically only need it if
+ stale metadata caused Impala to mistakenly choose a partitioned join operation. Typically, broadcast joins
+ are more efficient in cases where one table is much smaller than the other. (Put the smaller table on the
+ right side of the <codeph>JOIN</codeph> operator.)
+ </li>
+ </ul>
+
+ <p>
+ <b>Hints for INSERT ... SELECT queries:</b>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/insert_hints"/>
+
+ <p>
+ <b>Suggestions versus directives:</b>
+ </p>
+
+ <p>
+ In early Impala releases, hints were always obeyed and so acted more like directives. Once Impala gained join
+ order optimizations, sometimes join queries were automatically reordered in a way that made a hint
+ irrelevant. Therefore, the hints act more like suggestions in Impala 1.2.2 and higher.
+ </p>
+
+ <p>
+ To force Impala to follow the hinted execution mechanism for a join query, include the
+ <codeph>STRAIGHT_JOIN</codeph> keyword in the <codeph>SELECT</codeph> statement. See
+ <xref href="impala_perf_joins.xml#straight_join"/> for details. When you use this technique, Impala does not
+ reorder the joined tables at all, so you must be careful to arrange the join order to put the largest table
+ (or subquery result set) first, then the smallest, second smallest, third smallest, and so on. This ordering lets Impala do the
+ most I/O-intensive parts of the query using local reads on the data nodes, and then reduce the size of the
+ intermediate result set as much as possible as each subsequent table or subquery result set is joined.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p>
+ Queries that include subqueries in the <codeph>WHERE</codeph> clause can be rewritten internally as join
+ queries. Currently, you cannot apply hints to the joins produced by these types of queries.
+ </p>
+
+ <p>
+ Because hints can prevent queries from taking advantage of new metadata or improvements in query planning,
+ use them only when required to work around performance issues, and be prepared to remove them when they are
+ no longer required, such as after a new Impala release or bug fix.
+ </p>
+
+ <p>
+ In particular, the <codeph>[BROADCAST]</codeph> and <codeph>[SHUFFLE]</codeph> hints are expected to be
+ needed much less frequently in Impala 1.2.2 and higher, because the join order optimization feature in
+ combination with the <codeph>COMPUTE STATS</codeph> statement now automatically choose join order and join
+ mechanism without the need to rewrite the query and add hints. See
+ <xref href="impala_perf_joins.xml#perf_joins"/> for details.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/compatibility_blurb"/>
+
+ <p rev="2.0.0">
+ The hints embedded within <codeph>--</codeph> comments are compatible with Hive queries. The hints embedded
+ within <codeph>/* */</codeph> comments or <codeph>[ ]</codeph> square brackets are not recognized by or not
+ compatible with Hive. For example, Hive raises an error for Impala hints within <codeph>/* */</codeph>
+ comments because it does not recognize the Impala hint names.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/view_blurb"/>
+
+ <p rev="2.0.0">
+ If you use a hint in the query that defines a view, the hint is preserved when you query the view. Impala
+ internally rewrites all hints in views to use the <codeph>--</codeph> comment notation, so that Hive can
+ query such views without errors due to unrecognized hint names.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ For example, this query joins a large customer table with a small lookup table of less than 100 rows. The
+ right-hand table can be broadcast efficiently to all nodes involved in the join. Thus, you would use the
+ <codeph>[broadcast]</codeph> hint to force a broadcast join strategy:
+ </p>
+
+<codeblock>select straight_join customer.address, state_lookup.state_name
+ from customer join <b>[broadcast]</b> state_lookup
+ on customer.state_id = state_lookup.state_id;</codeblock>
+
+ <p>
+ This query joins two large tables of unpredictable size. You might benchmark the query with both kinds of
+ hints and find that it is more efficient to transmit portions of each table to other nodes for processing.
+ Thus, you would use the <codeph>[shuffle]</codeph> hint to force a partitioned join strategy:
+ </p>
+
+<codeblock>select straight_join weather.wind_velocity, geospatial.altitude
+ from weather join <b>[shuffle]</b> geospatial
+ on weather.lat = geospatial.lat and weather.long = geospatial.long;</codeblock>
+
+ <p>
+ For joins involving three or more tables, the hint applies to the tables on either side of that specific
+ <codeph>JOIN</codeph> keyword. The <codeph>STRAIGHT_JOIN</codeph> keyword ensures that joins are processed
+ in a predictable order from left to right. For example, this query joins
+ <codeph>t1</codeph> and <codeph>t2</codeph> using a partitioned join, then joins that result set to
+ <codeph>t3</codeph> using a broadcast join:
+ </p>
+
+<codeblock>select straight_join t1.name, t2.id, t3.price
+ from t1 join <b>[shuffle]</b> t2 join <b>[broadcast]</b> t3
+ on t1.id = t2.id and t2.id = t3.id;</codeblock>
+
+ <draft-comment translate="no"> This is a good place to add more sample output showing before and after EXPLAIN plans. </draft-comment>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ For more background information about join queries, see <xref href="impala_joins.xml#joins"/>. For
+ performance considerations, see <xref href="impala_perf_joins.xml#perf_joins"/>.
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_identifiers.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_identifiers.xml b/docs/topics/impala_identifiers.xml
new file mode 100644
index 0000000..55477ed
--- /dev/null
+++ b/docs/topics/impala_identifiers.xml
@@ -0,0 +1,114 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="identifiers">
+
+ <title>Overview of Impala Identifiers</title>
+ <titlealts><navtitle>Identifiers</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Databases"/>
+ <data name="Category" value="Tables"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Identifiers are the names of databases, tables, or columns that you specify in a SQL statement. The rules for
+ identifiers govern what names you can give to things you create, the notation for referring to names
+ containing unusual characters, and other aspects such as case sensitivity.
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ The minimum length of an identifier is 1 character.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ The maximum length of an identifier is currently 128 characters, enforced by the metastore database.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ An identifier must start with an alphabetic character. The remainder can contain any combination of
+ alphanumeric characters and underscores. Quoting the identifier with backticks has no effect on the allowed
+ characters in the name.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ An identifier can contain only ASCII characters.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ To use an identifier name that matches one of the Impala reserved keywords (listed in
+ <xref href="impala_reserved_words.xml#reserved_words"/>), surround the identifier with <codeph>``</codeph>
+ characters (backticks). Quote the reserved word even if it is part of a fully qualified name.
+ The following example shows how a reserved word can be used as a column name if it is quoted
+ with backticks in the <codeph>CREATE TABLE</codeph> statement, and how the column name
+ must also be quoted with backticks in a query:
+ </p>
+<codeblock>[localhost:21000] > create table reserved (`data` string);
+
+[localhost:21000] > select data from reserved;
+ERROR: AnalysisException: Syntax error in line 1:
+select data from reserved
+ ^
+Encountered: DATA
+Expected: ALL, CASE, CAST, DISTINCT, EXISTS, FALSE, IF, INTERVAL, NOT, NULL, STRAIGHT_JOIN, TRUE, IDENTIFIER
+CAUSED BY: Exception: Syntax error
+
+[localhost:21000] > select reserved.data from reserved;
+ERROR: AnalysisException: Syntax error in line 1:
+select reserved.data from reserved
+ ^
+Encountered: DATA
+Expected: IDENTIFIER
+CAUSED BY: Exception: Syntax error
+
+[localhost:21000] > select reserved.`data` from reserved;
+
+[localhost:21000] >
+</codeblock>
+
+ <note type="important">
+ Because the list of reserved words grows over time as new SQL syntax is added,
+ consider adopting coding conventions (especially for any automated scripts
+ or in packaged applications) to always quote all identifiers with backticks.
+ Quoting all identifiers protects your SQL from compatibility issues if
+ new reserved words are added in later releases.
+ </note>
+
+ </li>
+
+ <li>
+ <p>
+ Impala identifiers are always case-insensitive. That is, tables named <codeph>t1</codeph> and
+ <codeph>T1</codeph> always refer to the same table, regardless of quote characters. Internally, Impala
+ always folds all specified table and column names to lowercase. This is why the column headers in query
+ output are always displayed in lowercase.
+ </p>
+ </li>
+ </ul>
+
+ <p>
+ See <xref href="impala_aliases.xml#aliases"/> for how to define shorter or easier-to-remember aliases if the
+ original names are long or cryptic identifiers.
+ <ph conref="../shared/impala_common.xml#common/aliases_vs_identifiers"/>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/views_vs_identifiers"/>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_insert.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_insert.xml b/docs/topics/impala_insert.xml
new file mode 100644
index 0000000..6d0f68b
--- /dev/null
+++ b/docs/topics/impala_insert.xml
@@ -0,0 +1,676 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="insert">
+
+ <title>INSERT Statement</title>
+ <titlealts><navtitle>INSERT</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="ETL"/>
+ <data name="Category" value="Ingest"/>
+ <data name="Category" value="DML"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Tables"/>
+ <data audience="impala_next" name="Category" value="Kudu"/>
+ <!-- This is such an important statement, think if there are more applicable categories. -->
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">INSERT statement</indexterm>
+ Impala supports inserting into tables and partitions that you create with the Impala <codeph>CREATE
+ TABLE</codeph> statement, or pre-defined tables and partitions created through Hive.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>[<varname>with_clause</varname>]
+INSERT { INTO | OVERWRITE } [TABLE] <varname>table_name</varname>
+ [(<varname>column_list</varname>)]
+ [ PARTITION (<varname>partition_clause</varname>)]
+{
+ [<varname>hint_clause</varname>] <varname>select_statement</varname>
+ | VALUES (<varname>value</varname> [, <varname>value</varname> ...]) [, (<varname>value</varname> [, <varname>value</varname> ...]) ...]
+}
+
+partition_clause ::= <varname>col_name</varname> [= <varname>constant</varname>] [, <varname>col_name</varname> [= <varname>constant</varname>] ...]
+
+hint_clause ::= [SHUFFLE] | [NOSHUFFLE] (Note: the square brackets are part of the syntax.)
+</codeblock>
+
+ <p>
+ <b>Appending or replacing (INTO and OVERWRITE clauses):</b>
+ </p>
+
+ <p>
+ The <codeph>INSERT INTO</codeph> syntax appends data to a table. The existing data files are left as-is, and
+ the inserted data is put into one or more new data files.
+ </p>
+
+ <p>
+ The <codeph>INSERT OVERWRITE</codeph> syntax replaces the data in a table.
+<!-- What happens with INSERT OVERWRITE if the target is a single partition or multiple partitions? -->
+<!-- If that gets too detailed, cover later under "Partitioning Considerations". -->
+ Currently, the overwritten data files are deleted immediately; they do not go through the HDFS trash
+ mechanism.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p rev="2.3.0">
+ The <codeph>INSERT</codeph> statement currently does not support writing data files
+ containing complex types (<codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, and <codeph>MAP</codeph>).
+ To prepare Parquet data for such tables, you generate the data files outside Impala and then
+ use <codeph>LOAD DATA</codeph> or <codeph>CREATE EXTERNAL TABLE</codeph> to associate those
+ data files with the table. Currently, such tables must use the Parquet file format.
+ See <xref href="impala_complex_types.xml#complex_types"/> for details about working with complex types.
+ </p>
+
+ <p rev="kudu" audience="impala_next">
+ <b>Ignoring duplicate partition keys for Kudu tables (IGNORE clause)</b>
+ </p>
+
+ <p rev="kudu" audience="impala_next">
+ Normally, an <codeph>INSERT</codeph> operation into a Kudu table fails if
+ it would result in duplicate partition key columns for any rows.
+ Specify <codeph>INSERT IGNORE <varname>rest_of_statement</varname></codeph> to
+ make the <codeph>INSERT</codeph> continue in this case. The rows that would
+ have duplicate partition key columns are not inserted.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Impala currently supports:
+ </p>
+
+ <ul>
+ <li>
+ Copy data from another table using a <codeph>SELECT</codeph> query. In Impala 1.2.1 and higher, you can
+ combine <codeph>CREATE TABLE</codeph> and <codeph>INSERT</codeph> operations into a single step with the
+ <codeph>CREATE TABLE AS SELECT</codeph> syntax, which bypasses the actual <codeph>INSERT</codeph> keyword.
+ </li>
+
+ <li>
+ An optional <xref href="impala_with.xml#with"><codeph>WITH</codeph> clause</xref> before the
+ <codeph>INSERT</codeph> keyword, to define a subquery referenced in the <codeph>SELECT</codeph> portion.
+ </li>
+
+ <li>
+ Create one or more new rows using constant expressions through the <codeph>VALUES</codeph> clause. (The
+ <codeph>VALUES</codeph> clause was added in Impala 1.0.1.)
+ </li>
+
+ <li rev="1.1">
+ <p>
+ By default, the first column of each newly inserted row goes into the first column of the table, the
+ second column into the second column, and so on.
+ </p>
+ <p>
+ You can also specify the columns to be inserted, an arbitrarily ordered subset of the columns in the
+ destination table, by specifying a column list immediately after the name of the destination table. This
+ feature lets you adjust the inserted columns to match the layout of a <codeph>SELECT</codeph> statement,
+ rather than the other way around. (This feature was added in Impala 1.1.)
+ </p>
+ <p>
+ The number of columns mentioned in the column list (known as the <q>column permutation</q>) must match
+ the number of columns in the <codeph>SELECT</codeph> list or the <codeph>VALUES</codeph> tuples. The
+ order of columns in the column permutation can be different than in the underlying table, and the columns
+ of each input row are reordered to match. If the number of columns in the column permutation is less than
+ in the destination table, all unmentioned columns are set to <codeph>NULL</codeph>.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ For a partitioned table, the optional <codeph>PARTITION</codeph> clause identifies which partition or
+ partitions the new values go into. If a partition key column is given a constant value such as
+ <codeph>PARTITION (year=2012)</codeph> or <codeph>PARTITION (year=2012, month=2)</codeph>, all the
+ inserted rows use those same values for those partition key columns and you omit any corresponding
+ columns in the source table from the <codeph>SELECT</codeph> list. This form is known as <q>static
+ partitioning</q>.
+ </p>
+ <p>
+ If a partition key column is mentioned but not assigned a value, such as in <codeph>PARTITION (year,
+ region)</codeph> (both columns unassigned) or <codeph>PARTITION(year, region='CA')</codeph>
+ (<codeph>year</codeph> column unassigned), the unassigned columns are filled in with the final columns of
+ the <codeph>SELECT</codeph> list. In this case, the number of columns in the <codeph>SELECT</codeph> list
+ must equal the number of columns in the column permutation plus the number of partition key columns not
+ assigned a constant value. This form is known as <q>dynamic partitioning</q>.
+ </p>
+ <p>
+ See <xref href="impala_partitioning.xml#partition_static_dynamic"/> for examples and performance
+ characteristics of static and dynamic partitioned inserts.
+ </p>
+ </li>
+
+ <li rev="1.2.2">
+ An optional hint clause immediately before the <codeph>SELECT</codeph> keyword, to fine-tune the behavior
+ when doing an <codeph>INSERT ... SELECT</codeph> operation into partitioned Parquet tables. The hint
+ keywords are <codeph>[SHUFFLE]</codeph> and <codeph>[NOSHUFFLE]</codeph>, including the square brackets.
+ Inserting into partitioned Parquet tables can be a resource-intensive operation because it potentially
+ involves many files being written to HDFS simultaneously, and separate
+ <ph rev="parquet_block_size">large</ph> memory buffers being allocated to buffer the data for each
+ partition. For usage details, see <xref href="impala_parquet.xml#parquet_etl"/>.
+ </li>
+ </ul>
+
+ <note>
+ <ul>
+ <li>
+ Insert commands that partition or add files result in changes to Hive metadata. Because Impala uses Hive
+ metadata, such changes may necessitate a metadata refresh. For more information, see the
+ <xref href="impala_refresh.xml#refresh" format="dita">REFRESH</xref> function.
+ </li>
+
+ <li>
+ Currently, Impala can only insert data into tables that use the text and Parquet formats. For other file
+ formats, insert the data using Hive and use Impala to query it.
+ </li>
+
+ <li>
+ As an alternative to the <codeph>INSERT</codeph> statement, if you have existing data files elsewhere in
+ HDFS, the <codeph>LOAD DATA</codeph> statement can move those files into a table. This statement works
+ with tables of any file format.
+ </li>
+ </ul>
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/dml_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ When you insert the results of an expression, particularly of a built-in function call, into a small numeric
+ column such as <codeph>INT</codeph>, <codeph>SMALLINT</codeph>, <codeph>TINYINT</codeph>, or
+ <codeph>FLOAT</codeph>, you might need to use a <codeph>CAST()</codeph> expression to coerce values into the
+ appropriate type. Impala does not automatically convert from a larger type to a smaller one. For example, to
+ insert cosine values into a <codeph>FLOAT</codeph> column, write <codeph>CAST(COS(angle) AS FLOAT)</codeph>
+ in the <codeph>INSERT</codeph> statement to make the conversion explicit.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/insert_parquet_blocksize"/>
+
+ <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+
+ <note conref="../shared/impala_common.xml#common/compute_stats_next"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example sets up new tables with the same definition as the <codeph>TAB1</codeph> table from the
+ <xref href="impala_tutorial.xml#tutorial" format="dita">Tutorial</xref> section, using different file
+ formats, and demonstrates inserting data into the tables created with the <codeph>STORED AS TEXTFILE</codeph>
+ and <codeph>STORED AS PARQUET</codeph> clauses:
+ </p>
+
+<codeblock>CREATE DATABASE IF NOT EXISTS file_formats;
+USE file_formats;
+
+DROP TABLE IF EXISTS text_table;
+CREATE TABLE text_table
+( id INT, col_1 BOOLEAN, col_2 DOUBLE, col_3 TIMESTAMP )
+STORED AS TEXTFILE;
+
+DROP TABLE IF EXISTS parquet_table;
+CREATE TABLE parquet_table
+( id INT, col_1 BOOLEAN, col_2 DOUBLE, col_3 TIMESTAMP )
+STORED AS PARQUET;</codeblock>
+
+ <p>
+ With the <codeph>INSERT INTO TABLE</codeph> syntax, each new set of inserted rows is appended to any existing
+ data in the table. This is how you would record small amounts of data that arrive continuously, or ingest new
+ batches of data alongside the existing data. For example, after running 2 <codeph>INSERT INTO TABLE</codeph>
+ statements with 5 rows each, the table contains 10 rows total:
+ </p>
+
+<codeblock>[localhost:21000] > insert into table text_table select * from default.tab1;
+Inserted 5 rows in 0.41s
+
+[localhost:21000] > insert into table text_table select * from default.tab1;
+Inserted 5 rows in 0.46s
+
+[localhost:21000] > select count(*) from text_table;
++----------+
+| count(*) |
++----------+
+| 10 |
++----------+
+Returned 1 row(s) in 0.26s</codeblock>
+
+ <p>
+ With the <codeph>INSERT OVERWRITE TABLE</codeph> syntax, each new set of inserted rows replaces any existing
+ data in the table. This is how you load data to query in a data warehousing scenario where you analyze just
+ the data for a particular day, quarter, and so on, discarding the previous data each time. You might keep the
+ entire set of data in one raw table, and transfer and transform certain rows into a more compact and
+ efficient form to perform intensive analysis on that subset.
+ </p>
+
+ <p>
+ For example, here we insert 5 rows into a table using the <codeph>INSERT INTO</codeph> clause, then replace
+ the data by inserting 3 rows with the <codeph>INSERT OVERWRITE</codeph> clause. Afterward, the table only
+ contains the 3 rows from the final <codeph>INSERT</codeph> statement.
+ </p>
+
+<codeblock>[localhost:21000] > insert into table parquet_table select * from default.tab1;
+Inserted 5 rows in 0.35s
+
+[localhost:21000] > insert overwrite table parquet_table select * from default.tab1 limit 3;
+Inserted 3 rows in 0.43s
+[localhost:21000] > select count(*) from parquet_table;
++----------+
+| count(*) |
++----------+
+| 3 |
++----------+
+Returned 1 row(s) in 0.43s</codeblock>
+
+ <p>
+ The <codeph><xref href="impala_insert.xml#values">VALUES</xref></codeph> clause lets you insert one or more
+ rows by specifying constant values for all the columns. The number, types, and order of the expressions must
+ match the table definition.
+ </p>
+
+ <note id="insert_values_warning">
+ The <codeph>INSERT ... VALUES</codeph> technique is not suitable for loading large quantities of data into
+ HDFS-based tables, because the insert operations cannot be parallelized, and each one produces a separate
+ data file. Use it for setting up small dimension tables or tiny amounts of data for experimenting with SQL
+ syntax, or with HBase tables. Do not use it for large ETL jobs or benchmark tests for load operations. Do not
+ run scripts with thousands of <codeph>INSERT ... VALUES</codeph> statements that insert a single row each
+ time. If you do run <codeph>INSERT ... VALUES</codeph> operations to load data into a staging table as one
+ stage in an ETL pipeline, include multiple row values if possible within each <codeph>VALUES</codeph> clause,
+ and use a separate database to make cleanup easier if the operation does produce many tiny files.
+ </note>
+
+ <p>
+ The following example shows how to insert one row or multiple rows, with expressions of different types,
+ using literal values, expressions, and function return values:
+ </p>
+
+<codeblock>create table val_test_1 (c1 int, c2 float, c3 string, c4 boolean, c5 timestamp);
+insert into val_test_1 values (100, 99.9/10, 'abc', true, now());
+create table val_test_2 (id int, token string);
+insert overwrite val_test_2 values (1, 'a'), (2, 'b'), (-1,'xyzzy');</codeblock>
+
+ <p>
+ These examples show the type of <q>not implemented</q> error that you see when attempting to insert data into
+ a table with a file format that Impala currently does not write to:
+ </p>
+
+<codeblock>DROP TABLE IF EXISTS sequence_table;
+CREATE TABLE sequence_table
+( id INT, col_1 BOOLEAN, col_2 DOUBLE, col_3 TIMESTAMP )
+STORED AS SEQUENCEFILE;
+
+DROP TABLE IF EXISTS rc_table;
+CREATE TABLE rc_table
+( id INT, col_1 BOOLEAN, col_2 DOUBLE, col_3 TIMESTAMP )
+STORED AS RCFILE;
+
+[localhost:21000] > insert into table rc_table select * from default.tab1;
+Remote error
+Backend 0:RC_FILE not implemented.
+
+[localhost:21000] > insert into table sequence_table select * from default.tab1;
+Remote error
+Backend 0:SEQUENCE_FILE not implemented. </codeblock>
+
+ <p>
+ Inserting data into partitioned tables requires slightly different syntax that divides the partitioning
+ columns from the others:
+ </p>
+
+<codeblock>create table t1 (i int) <b>partitioned by (x int, y string)</b>;
+-- Select an INT column from another table.
+-- All inserted rows will have the same x and y values, as specified in the INSERT statement.
+-- This technique of specifying all the partition key values is known as static partitioning.
+insert into t1 <b>partition(x=10, y='a')</b> select c1 from some_other_table;
+-- Select two INT columns from another table.
+-- All inserted rows will have the same y value, as specified in the INSERT statement.
+-- Values from c2 go into t1.x.
+-- Any partitioning columns whose value is not specified are filled in
+-- from the columns specified last in the SELECT list.
+-- This technique of omitting some partition key values is known as dynamic partitioning.
+insert into t1 <b>partition(x, y='b')</b> select c1, c2 from some_other_table;
+-- Select an INT and a STRING column from another table.
+-- All inserted rows will have the same x value, as specified in the INSERT statement.
+-- Values from c3 go into t1.y.
+insert into t1 <b>partition(x=20, y)</b> select c1, c3 from some_other_table;</codeblock>
+
+ <p rev="1.1">
+ The following examples show how you can copy the data in all the columns from one table to another, copy the
+ data from only some columns, or specify the columns in the select list in a different order than they
+ actually appear in the table:
+ </p>
+
+<codeblock>-- Start with 2 identical tables.
+create table t1 (c1 int, c2 int);
+create table t2 like t1;
+
+-- If there is no () part after the destination table name,
+-- all columns must be specified, either as * or by name.
+insert into t2 select * from t1;
+insert into t2 select c1, c2 from t1;
+
+-- With the () notation following the destination table name,
+-- you can omit columns (all values for that column are NULL
+-- in the destination table), and/or reorder the values
+-- selected from the source table. This is the "column permutation" feature.
+insert into t2 (c1) select c1 from t1;
+insert into t2 (c2, c1) select c1, c2 from t1;
+
+-- The column names can be entirely different in the source and destination tables.
+-- You can copy any columns, not just the corresponding ones, from the source table.
+-- But the number and type of selected columns must match the columns mentioned in the () part.
+alter table t2 replace columns (x int, y int);
+insert into t2 (y) select c1 from t1;
+
+-- For partitioned tables, all the partitioning columns must be mentioned in the () column list
+-- or a PARTITION clause; these columns cannot be defaulted to NULL.
+create table pt1 (x int, y int) partitioned by (z int);
+-- The values from c1 are copied into the column x in the new table,
+-- all in the same partition based on a constant value for z.
+-- The values of y in the new table are all NULL.
+insert into pt1 (x) partition (z=5) select c1 from t1;
+-- Again we omit the values for column y so they are all NULL.
+-- The inserted x values can go into different partitions, based on
+-- the different values inserted into the partitioning column z.
+insert into pt1 (x,z) select x, z from t2;
+</codeblock>
+
+ <p>
+ <codeph>SELECT *</codeph> for a partitioned table requires that all partition key columns in the source table
+ be declared as the last columns in the <codeph>CREATE TABLE</codeph> statement. You still include a
+ <codeph>PARTITION</codeph> clause listing all the partition key columns. These partition columns are
+ automatically mapped to the last columns from the <codeph>SELECT *</codeph> list.
+ </p>
+
+<codeblock>create table source (x int, y int, year int, month int, day int);
+create table destination (x int, y int) partitioned by (year int, month int, day int);
+...load some data into the unpartitioned source table...
+-- Insert a single partition of data.
+-- The SELECT * means you cannot specify partition (year=2014, month, day).
+insert overwrite destination partition (year, month, day) select * from source where year=2014;
+-- Insert the data for all year/month/day combinations.
+insert overwrite destination partition (year, month, day) select * from source;
+
+-- If one of the partition columns is omitted from the source table,
+-- then you can specify a specific value for that column in the PARTITION clause.
+-- Here the source table holds only data from 2014, and so does not include a year column.
+create table source_2014 (x int, y int, month int, day int);
+...load some data into the unpartitioned source_2014 table...
+insert overwrite destination partition (year=2014, month, day) select * from source_2014;
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/insert_sort_blurb"/>
+
+ <p>
+ <b>Concurrency considerations:</b> Each <codeph>INSERT</codeph> operation creates new data files with unique
+ names, so you can run multiple <codeph>INSERT INTO</codeph> statements simultaneously without filename
+ conflicts.
+<!--
+If data is inserted into a table by a statement issued to a different
+<cmdname>impalad</cmdname> node,
+issue a <codeph>REFRESH <varname>table_name</varname></codeph>
+statement to make the node you are connected to aware of this new data.
+-->
+ While data is being inserted into an Impala table, the data is staged temporarily in a subdirectory inside
+ the data directory; during this period, you cannot issue queries against that table in Hive. If an
+ <codeph>INSERT</codeph> operation fails, the temporary data file and the subdirectory could be left behind in
+ the data directory. If so, remove the relevant subdirectory and any data files it contains manually, by
+ issuing an <codeph>hdfs dfs -rm -r</codeph> command, specifying the full path of the work subdirectory, whose
+ name ends in <codeph>_dir</codeph>.
+ </p>
+ </conbody>
+
+ <concept id="values">
+
+ <title>VALUES Clause</title>
+
+ <conbody>
+
+ <p>
+ The <codeph>VALUES</codeph> clause is a general-purpose way to specify the columns of one or more rows,
+ typically within an <codeph><xref href="impala_insert.xml#insert">INSERT</xref></codeph> statement.
+ </p>
+
+ <note conref="../shared/impala_common.xml#common/insert_values_warning">
+ <p/>
+ </note>
+
+ <p>
+ The following examples illustrate:
+ </p>
+
+ <ul>
+ <li>
+ How to insert a single row using a <codeph>VALUES</codeph> clause.
+ </li>
+
+ <li>
+ How to insert multiple rows using a <codeph>VALUES</codeph> clause.
+ </li>
+
+ <li>
+ How the row or rows from a <codeph>VALUES</codeph> clause can be appended to a table through
+ <codeph>INSERT INTO</codeph>, or replace the contents of the table through <codeph>INSERT
+ OVERWRITE</codeph>.
+ </li>
+
+ <li>
+ How the entries in a <codeph>VALUES</codeph> clause can be literals, function results, or any other kind
+ of expression. See <xref href="impala_literals.xml#literals"/> for the notation to use for literal
+ values, especially <xref href="impala_literals.xml#string_literals"/> for quoting and escaping
+ conventions for strings. See <xref href="impala_operators.xml#operators"/> and
+ <xref href="impala_functions.xml#builtins"/> for other things you can include in expressions with the
+ <codeph>VALUES</codeph> clause.
+ </li>
+ </ul>
+
+<codeblock>[localhost:21000] > describe val_example;
+Query: describe val_example
+Query finished, fetching results ...
++-------+---------+---------+
+| name | type | comment |
++-------+---------+---------+
+| id | int | |
+| col_1 | boolean | |
+| col_2 | double | |
++-------+---------+---------+
+
+[localhost:21000] > insert into val_example values (1,true,100.0);
+Inserted 1 rows in 0.30s
+[localhost:21000] > select * from val_example;
++----+-------+-------+
+| id | col_1 | col_2 |
++----+-------+-------+
+| 1 | true | 100 |
++----+-------+-------+
+
+[localhost:21000] > insert overwrite val_example values (10,false,pow(2,5)), (50,true,10/3);
+Inserted 2 rows in 0.16s
+[localhost:21000] > select * from val_example;
++----+-------+-------------------+
+| id | col_1 | col_2 |
++----+-------+-------------------+
+| 10 | false | 32 |
+| 50 | true | 3.333333333333333 |
++----+-------+-------------------+</codeblock>
+
+ <p>
+ When used in an <codeph>INSERT</codeph> statement, the Impala <codeph>VALUES</codeph> clause can specify
+ some or all of the columns in the destination table, and the columns can be specified in a different order
+ than they actually appear in the table. To specify a different set or order of columns than in the table,
+ use the syntax:
+ </p>
+
+<codeblock>INSERT INTO <varname>destination</varname>
+ (<varname>col_x</varname>, <varname>col_y</varname>, <varname>col_z</varname>)
+ VALUES
+ (<varname>val_x</varname>, <varname>val_y</varname>, <varname>val_z</varname>);
+</codeblock>
+
+ <p>
+ Any columns in the table that are not listed in the <codeph>INSERT</codeph> statement are set to
+ <codeph>NULL</codeph>.
+ </p>
+
+<!--
+ <p>
+ does not support specifying a subset of the
+ columns in the table or specifying the columns in a different order. Use a
+ <codeph>VALUES</codeph> clause with all the column values in the same order as
+ the table definition, using <codeph>NULL</codeph> values for any columns you
+ want to omit from the <codeph>INSERT</codeph> operation.
+ </p>
+-->
+
+ <p>
+ To use a <codeph>VALUES</codeph> clause like a table in other statements, wrap it in parentheses and use
+ <codeph>AS</codeph> clauses to specify aliases for the entire object and any columns you need to refer to:
+ </p>
+
+<codeblock>[localhost:21000] > select * from (values(4,5,6),(7,8,9)) as t;
++---+---+---+
+| 4 | 5 | 6 |
++---+---+---+
+| 4 | 5 | 6 |
+| 7 | 8 | 9 |
++---+---+---+
+[localhost:21000] > select * from (values(1 as c1, true as c2, 'abc' as c3),(100,false,'xyz')) as t;
++-----+-------+-----+
+| c1 | c2 | c3 |
++-----+-------+-----+
+| 1 | true | abc |
+| 100 | false | xyz |
++-----+-------+-----+</codeblock>
+
+ <p>
+ For example, you might use a tiny table constructed like this from constant literals or function return
+ values as part of a longer statement involving joins or <codeph>UNION ALL</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/hdfs_blurb"/>
+
+ <p>
+ Impala physically writes all inserted files under the ownership of its default user, typically
+ <codeph>impala</codeph>. Therefore, this user must have HDFS write permission in the corresponding table
+ directory.
+ </p>
+
+ <p>
+ The permission requirement is independent of the authorization performed by the Sentry framework. (If the
+ connected user is not authorized to insert into a table, Sentry blocks that operation immediately,
+ regardless of the privileges available to the <codeph>impala</codeph> user.) Files created by Impala are
+ not owned by and do not inherit permissions from the connected user.
+ </p>
+
+ <p>
+ The number of data files produced by an <codeph>INSERT</codeph> statement depends on the size of the
+ cluster, the number of data blocks that are processed, the partition key columns in a partitioned table,
+ and the mechanism Impala uses for dividing the work in parallel. Do not assume that an
+ <codeph>INSERT</codeph> statement will produce some particular number of output files. In case of
+ performance issues with data written by Impala, check that the output files do not suffer from issues such
+ as many tiny files or many tiny partitions. (In the Hadoop context, even files or partitions of a few tens
+ of megabytes are considered <q>tiny</q>.)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/insert_hidden_work_directory"/>
+
+ <p conref="../shared/impala_common.xml#common/hbase_blurb"/>
+
+ <p>
+ You can use the <codeph>INSERT</codeph> statement with HBase tables as follows:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ You can insert a single row or a small set of rows into an HBase table with the <codeph>INSERT ...
+ VALUES</codeph> syntax. This is a good use case for HBase tables with Impala, because HBase tables are
+ not subject to the same kind of fragmentation from many small insert operations as HDFS tables are.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ You can insert any number of rows at once into an HBase table using the <codeph>INSERT ...
+ SELECT</codeph> syntax.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If more than one inserted row has the same value for the HBase key column, only the last inserted row
+ with that value is visible to Impala queries. You can take advantage of this fact with <codeph>INSERT
+ ... VALUES</codeph> statements to effectively update rows one at a time, by inserting new rows with the
+ same key values as existing rows. Be aware that after an <codeph>INSERT ... SELECT</codeph> operation
+ copying from an HDFS table, the HBase table might contain fewer rows than were inserted, if the key
+ column in the source table contained duplicate values.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ You cannot <codeph>INSERT OVERWRITE</codeph> into an HBase table. New rows are always appended.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ When you create an Impala or Hive table that maps to an HBase table, the column order you specify with
+ the <codeph>INSERT</codeph> statement might be different than the order you declare with the
+ <codeph>CREATE TABLE</codeph> statement. Behind the scenes, HBase arranges the columns based on how
+ they are divided into column families. This might cause a mismatch during insert operations, especially
+ if you use the syntax <codeph>INSERT INTO <varname>hbase_table</varname> SELECT * FROM
+ <varname>hdfs_table</varname></codeph>. Before inserting data, verify the column order by issuing a
+ <codeph>DESCRIBE</codeph> statement for the table, and adjust the order of the select list in the
+ <codeph>INSERT</codeph> statement.
+ </p>
+ </li>
+ </ul>
+
+ <p>
+ See <xref href="impala_hbase.xml#impala_hbase"/> for more details about using Impala with HBase.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/s3_blurb"/>
+ <p conref="../shared/impala_common.xml#common/s3_dml"/>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+ <p conref="../shared/impala_common.xml#common/redaction_yes"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_yes"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have read
+ permission for the files in the source directory of an <codeph>INSERT ... SELECT</codeph>
+ operation, and write permission for all affected directories in the destination table.
+ (An <codeph>INSERT</codeph> operation could write files to multiple different HDFS directories
+ if the destination table is partitioned.)
+ This user must also have write permission to create a temporary work directory
+ in the top-level HDFS directory of the destination table.
+ An <codeph>INSERT OVERWRITE</codeph> operation does not require write permission on
+ the original data files in the table, only on the table directories themselves.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/char_varchar_cast_from_string"/>
+
+ <p conref="../shared/impala_common.xml#common/related_options"/>
+
+ <p rev="1.3.1" conref="../shared/impala_common.xml#common/insert_inherit_permissions"/>
+ </conbody>
+ </concept>
+
+<!-- Values clause -->
+</concept>
+<!-- INSERT statement -->
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_int.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_int.xml b/docs/topics/impala_int.xml
new file mode 100644
index 0000000..514d377
--- /dev/null
+++ b/docs/topics/impala_int.xml
@@ -0,0 +1,95 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="int">
+
+ <title>INT Data Type</title>
+ <titlealts><navtitle>INT</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ A 4-byte integer data type used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph> statements.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ In the column definition of a <codeph>CREATE TABLE</codeph> statement:
+ </p>
+
+<codeblock><varname>column_name</varname> INT</codeblock>
+
+ <p>
+ <b>Range:</b> -2147483648 .. 2147483647. There is no <codeph>UNSIGNED</codeph> subtype.
+ </p>
+
+ <p>
+ <b>Conversions:</b> Impala automatically converts to a larger integer type (<codeph>BIGINT</codeph>) or a
+ floating-point type (<codeph>FLOAT</codeph> or <codeph>DOUBLE</codeph>). Use
+ <codeph>CAST()</codeph> to convert to <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>,
+ <codeph>STRING</codeph>, or <codeph>TIMESTAMP</codeph>.
+ <ph conref="../shared/impala_common.xml#common/cast_int_to_timestamp"/>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ The data type <codeph>INTEGER</codeph> is an alias for <codeph>INT</codeph>.
+ </p>
+
+ <p>
+ For a convenient and automated way to check the bounds of the <codeph>INT</codeph> type, call the functions
+ <codeph>MIN_INT()</codeph> and <codeph>MAX_INT()</codeph>.
+ </p>
+
+ <p>
+ If an integer value is too large to be represented as an <codeph>INT</codeph>, use a <codeph>BIGINT</codeph>
+ instead.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/null_bad_numeric_cast"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>CREATE TABLE t1 (x INT);
+SELECT CAST(1000 AS INT);
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/partitioning_good"/>
+
+ <p conref="../shared/impala_common.xml#common/hbase_ok"/>
+
+ <p conref="../shared/impala_common.xml#common/parquet_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/text_bulky"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/internals_4_bytes"/>
+
+ <p conref="../shared/impala_common.xml#common/added_forever"/>
+
+ <p conref="../shared/impala_common.xml#common/column_stats_constant"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/restrictions_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_literals.xml#numeric_literals"/>, <xref href="impala_tinyint.xml#tinyint"/>,
+ <xref href="impala_smallint.xml#smallint"/>, <xref href="impala_int.xml#int"/>,
+ <xref href="impala_bigint.xml#bigint"/>, <xref href="impala_decimal.xml#decimal"/>,
+ <xref href="impala_math_functions.xml#math_functions"/>
+ </p>
+ </conbody>
+</concept>
[12/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_double.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_double.xml b/docs/topics/impala_double.xml
new file mode 100644
index 0000000..f1d1756
--- /dev/null
+++ b/docs/topics/impala_double.xml
@@ -0,0 +1,100 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="double">
+
+ <title>DOUBLE Data Type</title>
+ <titlealts><navtitle>DOUBLE</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ A double precision floating-point data type used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER
+ TABLE</codeph> statements.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ In the column definition of a <codeph>CREATE TABLE</codeph> statement:
+ </p>
+
+<codeblock><varname>column_name</varname> DOUBLE</codeblock>
+
+ <p>
+ <b>Range:</b> 4.94065645841246544e-324 .. 1.79769313486231570e+308, positive or negative
+ </p>
+
+ <p>
+ <b>Precision:</b> 15 to 17 significant digits, depending on usage. The number of significant digits does
+ not depend on the position of the decimal point.
+ </p>
+
+ <p>
+ <b>Representation:</b> The values are stored in 8 bytes, using
+ <xref href="https://en.wikipedia.org/wiki/Double-precision_floating-point_format" scope="external" format="html">IEEE 754 Double Precision Binary Floating Point</xref> format.
+ </p>
+
+ <p>
+ <b>Conversions:</b> Impala does not automatically convert <codeph>DOUBLE</codeph> to any other type. You can
+ use <codeph>CAST()</codeph> to convert <codeph>DOUBLE</codeph> values to <codeph>FLOAT</codeph>,
+ <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>, <codeph>INT</codeph>, <codeph>BIGINT</codeph>,
+ <codeph>STRING</codeph>, <codeph>TIMESTAMP</codeph>, or <codeph>BOOLEAN</codeph>. You can use exponential
+ notation in <codeph>DOUBLE</codeph> literals or when casting from <codeph>STRING</codeph>, for example
+ <codeph>1.0e6</codeph> to represent one million.
+ <ph conref="../shared/impala_common.xml#common/cast_int_to_timestamp"/>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ The data type <codeph>REAL</codeph> is an alias for <codeph>DOUBLE</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>CREATE TABLE t1 (x DOUBLE);
+SELECT CAST(1000.5 AS DOUBLE);
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/partitioning_imprecise"/>
+
+ <p conref="../shared/impala_common.xml#common/hbase_ok"/>
+
+ <p conref="../shared/impala_common.xml#common/parquet_ok"/>
+
+ <p conref="../shared/impala_common.xml#common/text_bulky"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/internals_8_bytes"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/added_in_20"/> -->
+
+ <p conref="../shared/impala_common.xml#common/column_stats_constant"/>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+<!-- This conref appears under SUM(), AVG(), FLOAT, and DOUBLE topics. -->
+
+ <p conref="../shared/impala_common.xml#common/sum_double"/>
+
+ <p conref="../shared/impala_common.xml#common/float_double_decimal_caveat"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_literals.xml#numeric_literals"/>, <xref href="impala_math_functions.xml#math_functions"/>,
+ <xref href="impala_float.xml#float"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_drop_database.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_drop_database.xml b/docs/topics/impala_drop_database.xml
new file mode 100644
index 0000000..c6a1b64
--- /dev/null
+++ b/docs/topics/impala_drop_database.xml
@@ -0,0 +1,124 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="drop_database">
+
+ <title>DROP DATABASE Statement</title>
+ <titlealts><navtitle>DROP DATABASE</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Databases"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">DROP DATABASE statement</indexterm>
+ Removes a database from the system. The physical operations involve removing the metadata for the database
+ from the metastore, and deleting the corresponding <codeph>*.db</codeph> directory from HDFS.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>DROP (DATABASE|SCHEMA) [IF EXISTS] <varname>database_name</varname> <ph rev="2.3.0">[RESTRICT | CASCADE]</ph>;</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ By default, the database must be empty before it can be dropped, to avoid losing any data.
+ </p>
+
+ <p rev="2.3.0">
+ In CDH 5.5 / Impala 2.3 and higher, you can include the <codeph>CASCADE</codeph>
+ clause to make Impala drop all tables and other objects in the database before dropping the database itself.
+ The <codeph>RESTRICT</codeph> clause enforces the original requirement that the database be empty
+ before being dropped. Because the <codeph>RESTRICT</codeph> behavior is still the default, this
+ clause is optional.
+ </p>
+
+ <p rev="2.3.0">
+ The automatic dropping resulting from the <codeph>CASCADE</codeph> clause follows the same rules as the
+ corresponding <codeph>DROP TABLE</codeph>, <codeph>DROP VIEW</codeph>, and <codeph>DROP FUNCTION</codeph> statements.
+ In particular, the HDFS directories and data files for any external tables are left behind when the
+ tables are removed.
+ </p>
+
+ <p>
+ When you do not use the <codeph>CASCADE</codeph> clause, drop or move all the objects inside the database manually
+ before dropping the database itself:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ Use the <codeph>SHOW TABLES</codeph> statement to locate all tables and views in the database,
+ and issue <codeph>DROP TABLE</codeph> and <codeph>DROP VIEW</codeph> statements to remove them all.
+ </p>
+ </li>
+ <li>
+ <p>
+ Use the <codeph>SHOW FUNCTIONS</codeph> and <codeph>SHOW AGGREGATE FUNCTIONS</codeph> statements
+ to locate all user-defined functions in the database, and issue <codeph>DROP FUNCTION</codeph>
+ and <codeph>DROP AGGREGATE FUNCTION</codeph> statements to remove them all.
+ </p>
+ </li>
+ <li>
+ <p>
+ To keep tables or views contained by a database while removing the database itself, use
+ <codeph>ALTER TABLE</codeph> and <codeph>ALTER VIEW</codeph> to move the relevant
+ objects to a different database before dropping the original database.
+ </p>
+ </li>
+ </ul>
+
+ <p>
+ You cannot drop the current database, that is, the database your session connected to
+ either through the <codeph>USE</codeph> statement or the <codeph>-d</codeph> option of <cmdname>impala-shell</cmdname>.
+ Issue a <codeph>USE</codeph> statement to switch to a different database first.
+ Because the <codeph>default</codeph> database is always available, issuing
+ <codeph>USE default</codeph> is a convenient way to leave the current database
+ before dropping it.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/hive_blurb"/>
+
+ <p>
+ When you drop a database in Impala, the database can no longer be used by Hive.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<!-- Better to conref the same examples in both places. -->
+
+ <p>
+ See <xref href="impala_create_database.xml#create_database"/> for examples covering <codeph>CREATE
+ DATABASE</codeph>, <codeph>USE</codeph>, and <codeph>DROP DATABASE</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have write
+ permission for the directory associated with the database.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <codeblock conref="../shared/impala_common.xml#common/create_drop_db_example"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_databases.xml#databases"/>, <xref href="impala_create_database.xml#create_database"/>,
+ <xref href="impala_use.xml#use"/>, <xref href="impala_show.xml#show_databases"/>, <xref href="impala_drop_table.xml#drop_table"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_drop_function.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_drop_function.xml b/docs/topics/impala_drop_function.xml
new file mode 100644
index 0000000..51a4d90
--- /dev/null
+++ b/docs/topics/impala_drop_function.xml
@@ -0,0 +1,60 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.2" id="drop_function">
+
+ <title>DROP FUNCTION Statement</title>
+ <titlealts><navtitle>DROP FUNCTION</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="UDFs"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">DROP FUNCTION statement</indexterm>
+ Removes a user-defined function (UDF), so that it is not available for execution during Impala
+ <codeph>SELECT</codeph> or <codeph>INSERT</codeph> operations.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>DROP [AGGREGATE] FUNCTION [IF EXISTS] [<varname>db_name</varname>.]<varname>function_name</varname>(<varname>type</varname>[, <varname>type</varname>...])</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Because the same function name could be overloaded with different argument signatures, you specify the
+ argument types to identify the exact function to drop.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/udf_persistence_restriction"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, does not need any
+ particular HDFS permissions to perform this statement.
+ All read and write operations are on the metastore database,
+ not HDFS files and directories.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_udf.xml#udfs"/>, <xref href="impala_create_function.xml#create_function"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_drop_role.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_drop_role.xml b/docs/topics/impala_drop_role.xml
new file mode 100644
index 0000000..35d2157
--- /dev/null
+++ b/docs/topics/impala_drop_role.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.4.0" id="drop_role">
+
+ <title>DROP ROLE Statement (CDH 5.2 or higher only)</title>
+ <titlealts><navtitle>DROP ROLE (CDH 5.2 or higher only)</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Sentry"/>
+ <data name="Category" value="Roles"/>
+ <!-- Consider whether to go deeper into categories like Security for the Sentry-related statements. -->
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">DROP ROLE statement</indexterm>
+<!-- Copied from Sentry docs. Turn into conref. I did some rewording for clarity. -->
+ The <codeph>DROP ROLE</codeph> statement removes a role from the metastore database. Once dropped, the role
+ is revoked for all users to whom it was previously assigned, and all privileges granted to that role are
+ revoked. Queries that are already executing are not affected. Impala verifies the role information
+ approximately every 60 seconds, so the effects of <codeph>DROP ROLE</codeph> might not take effect for new
+ Impala queries for a brief period.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>DROP ROLE <varname>role_name</varname>
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/privileges_blurb"/>
+
+ <p>
+ Only administrative users (initially, a predefined set of users specified in the Sentry service configuration
+ file) can use this statement.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/compatibility_blurb"/>
+
+ <p>
+ Impala makes use of any roles and privileges specified by the <codeph>GRANT</codeph> and
+ <codeph>REVOKE</codeph> statements in Hive, and Hive makes use of any roles and privileges specified by the
+ <codeph>GRANT</codeph> and <codeph>REVOKE</codeph> statements in Impala. The Impala <codeph>GRANT</codeph>
+ and <codeph>REVOKE</codeph> statements for privileges do not require the <codeph>ROLE</codeph> keyword to be
+ repeated before each role name, unlike the equivalent Hive statements.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_authorization.xml#authorization"/>, <xref href="impala_grant.xml#grant"/>,
+ <xref href="impala_revoke.xml#revoke"/>, <xref href="impala_create_role.xml#create_role"/>,
+ <xref href="impala_show.xml#show"/>
+ </p>
+
+<!-- To do: nail down the new SHOW syntax, e.g. SHOW ROLES, SHOW CURRENT ROLES, SHOW GROUPS. -->
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_drop_stats.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_drop_stats.xml b/docs/topics/impala_drop_stats.xml
new file mode 100644
index 0000000..56697f4
--- /dev/null
+++ b/docs/topics/impala_drop_stats.xml
@@ -0,0 +1,275 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.1.0" id="drop_stats" xml:lang="en-US">
+
+ <title>DROP STATS Statement</title>
+ <titlealts><navtitle>DROP STATS</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Performance"/>
+ <data name="Category" value="Scalability"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">DROP STATS statement</indexterm>
+ Removes the specified statistics from a table or partition. The statistics were originally created by the
+ <codeph>COMPUTE STATS</codeph> or <codeph>COMPUTE INCREMENTAL STATS</codeph> statement.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock rev="2.1.0">DROP STATS [<varname>database_name</varname>.]<varname>table_name</varname>
+DROP INCREMENTAL STATS [<varname>database_name</varname>.]<varname>table_name</varname> PARTITION (<varname>partition_spec</varname>)
+
+<varname>partition_spec</varname> ::= <varname>partition_col</varname>=<varname>constant_value</varname>
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/incremental_partition_spec"/>
+
+ <p>
+ <codeph>DROP STATS</codeph> removes all statistics from the table, whether created by <codeph>COMPUTE
+ STATS</codeph> or <codeph>COMPUTE INCREMENTAL STATS</codeph>.
+ </p>
+
+ <p rev="2.1.0">
+ <codeph>DROP INCREMENTAL STATS</codeph> only affects incremental statistics for a single partition, specified
+ through the <codeph>PARTITION</codeph> clause. The incremental stats are marked as outdated, so that they are
+ recomputed by the next <codeph>COMPUTE INCREMENTAL STATS</codeph> statement.
+ </p>
+
+<!-- To do: what release was this added in? -->
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ You typically use this statement when the statistics for a table or a partition have become stale due to data
+ files being added to or removed from the associated HDFS data directories, whether by manual HDFS operations
+ or <codeph>INSERT</codeph>, <codeph>INSERT OVERWRITE</codeph>, or <codeph>LOAD DATA</codeph> statements, or
+ adding or dropping partitions.
+ </p>
+
+ <p>
+ When a table or partition has no associated statistics, Impala treats it as essentially zero-sized when
+ constructing the execution plan for a query. In particular, the statistics influence the order in which
+ tables are joined in a join query. To ensure proper query planning and good query performance and
+ scalability, make sure to run <codeph>COMPUTE STATS</codeph> or <codeph>COMPUTE INCREMENTAL STATS</codeph> on
+ the table or partition after removing any stale statistics.
+ </p>
+
+ <p>
+ Dropping the statistics is not required for an unpartitioned table or a partitioned table covered by the
+ original type of statistics. A subsequent <codeph>COMPUTE STATS</codeph> statement replaces any existing
+ statistics with new ones, for all partitions, regardless of whether the old ones were outdated. Therefore,
+ this statement was rarely used before the introduction of incremental statistics.
+ </p>
+
+ <p>
+ Dropping the statistics is required for a partitioned table containing incremental statistics, to make a
+ subsequent <codeph>COMPUTE INCREMENTAL STATS</codeph> statement rescan an existing partition. See
+ <xref href="impala_perf_stats.xml#perf_stats"/> for information about incremental statistics, a new feature
+ available in Impala 2.1.0 and higher.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, does not need any
+ particular HDFS permissions to perform this statement.
+ All read and write operations are on the metastore database,
+ not HDFS files and directories.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example shows a partitioned table that has associated statistics produced by the
+ <codeph>COMPUTE INCREMENTAL STATS</codeph> statement, and how the situation evolves as statistics are dropped
+ from specific partitions, then the entire table.
+ </p>
+
+ <p>
+ Initially, all table and column statistics are filled in.
+ </p>
+
+<!-- Note: chopped off any excess characters at position 87 and after,
+ to avoid weird wrapping in PDF.
+ Applies to any subsequent examples with output from SHOW ... STATS too. -->
+
+<codeblock>show table stats item_partitioned;
++-------------+-------+--------+----------+--------------+---------+------------------
+| i_category | #Rows | #Files | Size | Bytes Cached | Format | Incremental stats
++-------------+-------+--------+----------+--------------+---------+------------------
+| Books | 1733 | 1 | 223.74KB | NOT CACHED | PARQUET | true
+| Children | 1786 | 1 | 230.05KB | NOT CACHED | PARQUET | true
+| Electronics | 1812 | 1 | 232.67KB | NOT CACHED | PARQUET | true
+| Home | 1807 | 1 | 232.56KB | NOT CACHED | PARQUET | true
+| Jewelry | 1740 | 1 | 223.72KB | NOT CACHED | PARQUET | true
+| Men | 1811 | 1 | 231.25KB | NOT CACHED | PARQUET | true
+| Music | 1860 | 1 | 237.90KB | NOT CACHED | PARQUET | true
+| Shoes | 1835 | 1 | 234.90KB | NOT CACHED | PARQUET | true
+| Sports | 1783 | 1 | 227.97KB | NOT CACHED | PARQUET | true
+| Women | 1790 | 1 | 226.27KB | NOT CACHED | PARQUET | true
+| Total | 17957 | 10 | 2.25MB | 0B | |
++-------------+-------+--------+----------+--------------+---------+------------------
+show column stats item_partitioned;
++------------------+-----------+------------------+--------+----------+---------------
+| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size
++------------------+-----------+------------------+--------+----------+---------------
+| i_item_sk | INT | 19443 | -1 | 4 | 4
+| i_item_id | STRING | 9025 | -1 | 16 | 16
+| i_rec_start_date | TIMESTAMP | 4 | -1 | 16 | 16
+| i_rec_end_date | TIMESTAMP | 3 | -1 | 16 | 16
+| i_item_desc | STRING | 13330 | -1 | 200 | 100.3028030395
+| i_current_price | FLOAT | 2807 | -1 | 4 | 4
+| i_wholesale_cost | FLOAT | 2105 | -1 | 4 | 4
+| i_brand_id | INT | 965 | -1 | 4 | 4
+| i_brand | STRING | 725 | -1 | 22 | 16.17760086059
+| i_class_id | INT | 16 | -1 | 4 | 4
+| i_class | STRING | 101 | -1 | 15 | 7.767499923706
+| i_category_id | INT | 10 | -1 | 4 | 4
+| i_manufact_id | INT | 1857 | -1 | 4 | 4
+| i_manufact | STRING | 1028 | -1 | 15 | 11.32950019836
+| i_size | STRING | 8 | -1 | 11 | 4.334599971771
+| i_formulation | STRING | 12884 | -1 | 20 | 19.97999954223
+| i_color | STRING | 92 | -1 | 10 | 5.380899906158
+| i_units | STRING | 22 | -1 | 7 | 4.186900138854
+| i_container | STRING | 2 | -1 | 7 | 6.992599964141
+| i_manager_id | INT | 105 | -1 | 4 | 4
+| i_product_name | STRING | 19094 | -1 | 25 | 18.02330017089
+| i_category | STRING | 10 | 0 | -1 | -1
++------------------+-----------+------------------+--------+----------+---------------
+</codeblock>
+
+ <p>
+ To remove statistics for particular partitions, use the <codeph>DROP INCREMENTAL STATS</codeph> statement.
+ After removing statistics for two partitions, the table-level statistics reflect that change in the
+ <codeph>#Rows</codeph> and <codeph>Incremental stats</codeph> fields. The counts, maximums, and averages of
+ the column-level statistics are unaffected.
+ </p>
+
+ <note>
+ (It is possible that the row count might be preserved in future after a <codeph>DROP INCREMENTAL
+ STATS</codeph> statement. Check the resolution of the issue
+ <xref href="https://issues.cloudera.org/browse/IMPALA-1615" scope="external" format="html">IMPALA-1615</xref>.)
+ </note>
+
+<codeblock>drop incremental stats item_partitioned partition (i_category='Sports');
+drop incremental stats item_partitioned partition (i_category='Electronics');
+
+show table stats item_partitioned
++-------------+-------+--------+----------+--------------+---------+------------------
+| i_category | #Rows | #Files | Size | Bytes Cached | Format | Incremental stats
++-------------+-------+--------+----------+--------------+---------+------------------
+| Books | 1733 | 1 | 223.74KB | NOT CACHED | PARQUET | true
+| Children | 1786 | 1 | 230.05KB | NOT CACHED | PARQUET | true
+| Electronics | -1 | 1 | 232.67KB | NOT CACHED | PARQUET | false
+| Home | 1807 | 1 | 232.56KB | NOT CACHED | PARQUET | true
+| Jewelry | 1740 | 1 | 223.72KB | NOT CACHED | PARQUET | true
+| Men | 1811 | 1 | 231.25KB | NOT CACHED | PARQUET | true
+| Music | 1860 | 1 | 237.90KB | NOT CACHED | PARQUET | true
+| Shoes | 1835 | 1 | 234.90KB | NOT CACHED | PARQUET | true
+| Sports | -1 | 1 | 227.97KB | NOT CACHED | PARQUET | false
+| Women | 1790 | 1 | 226.27KB | NOT CACHED | PARQUET | true
+| Total | 17957 | 10 | 2.25MB | 0B | |
++-------------+-------+--------+----------+--------------+---------+------------------
+show column stats item_partitioned
++------------------+-----------+------------------+--------+----------+---------------
+| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size
++------------------+-----------+------------------+--------+----------+---------------
+| i_item_sk | INT | 19443 | -1 | 4 | 4
+| i_item_id | STRING | 9025 | -1 | 16 | 16
+| i_rec_start_date | TIMESTAMP | 4 | -1 | 16 | 16
+| i_rec_end_date | TIMESTAMP | 3 | -1 | 16 | 16
+| i_item_desc | STRING | 13330 | -1 | 200 | 100.3028030395
+| i_current_price | FLOAT | 2807 | -1 | 4 | 4
+| i_wholesale_cost | FLOAT | 2105 | -1 | 4 | 4
+| i_brand_id | INT | 965 | -1 | 4 | 4
+| i_brand | STRING | 725 | -1 | 22 | 16.17760086059
+| i_class_id | INT | 16 | -1 | 4 | 4
+| i_class | STRING | 101 | -1 | 15 | 7.767499923706
+| i_category_id | INT | 10 | -1 | 4 | 4
+| i_manufact_id | INT | 1857 | -1 | 4 | 4
+| i_manufact | STRING | 1028 | -1 | 15 | 11.32950019836
+| i_size | STRING | 8 | -1 | 11 | 4.334599971771
+| i_formulation | STRING | 12884 | -1 | 20 | 19.97999954223
+| i_color | STRING | 92 | -1 | 10 | 5.380899906158
+| i_units | STRING | 22 | -1 | 7 | 4.186900138854
+| i_container | STRING | 2 | -1 | 7 | 6.992599964141
+| i_manager_id | INT | 105 | -1 | 4 | 4
+| i_product_name | STRING | 19094 | -1 | 25 | 18.02330017089
+| i_category | STRING | 10 | 0 | -1 | -1
++------------------+-----------+------------------+--------+----------+---------------
+</codeblock>
+
+ <p>
+ To remove all statistics from the table, whether produced by <codeph>COMPUTE STATS</codeph> or
+ <codeph>COMPUTE INCREMENTAL STATS</codeph>, use the <codeph>DROP STATS</codeph> statement without the
+ <codeph>INCREMENTAL</codeph> clause. Now, both table-level and column-level statistics are reset.
+ </p>
+
+<codeblock>drop stats item_partitioned;
+
+show table stats item_partitioned
++-------------+-------+--------+----------+--------------+---------+------------------
+| i_category | #Rows | #Files | Size | Bytes Cached | Format | Incremental stats
++-------------+-------+--------+----------+--------------+---------+------------------
+| Books | -1 | 1 | 223.74KB | NOT CACHED | PARQUET | false
+| Children | -1 | 1 | 230.05KB | NOT CACHED | PARQUET | false
+| Electronics | -1 | 1 | 232.67KB | NOT CACHED | PARQUET | false
+| Home | -1 | 1 | 232.56KB | NOT CACHED | PARQUET | false
+| Jewelry | -1 | 1 | 223.72KB | NOT CACHED | PARQUET | false
+| Men | -1 | 1 | 231.25KB | NOT CACHED | PARQUET | false
+| Music | -1 | 1 | 237.90KB | NOT CACHED | PARQUET | false
+| Shoes | -1 | 1 | 234.90KB | NOT CACHED | PARQUET | false
+| Sports | -1 | 1 | 227.97KB | NOT CACHED | PARQUET | false
+| Women | -1 | 1 | 226.27KB | NOT CACHED | PARQUET | false
+| Total | -1 | 10 | 2.25MB | 0B | |
++-------------+-------+--------+----------+--------------+---------+------------------
+show column stats item_partitioned
++------------------+-----------+------------------+--------+----------+----------+
+| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size |
++------------------+-----------+------------------+--------+----------+----------+
+| i_item_sk | INT | -1 | -1 | 4 | 4 |
+| i_item_id | STRING | -1 | -1 | -1 | -1 |
+| i_rec_start_date | TIMESTAMP | -1 | -1 | 16 | 16 |
+| i_rec_end_date | TIMESTAMP | -1 | -1 | 16 | 16 |
+| i_item_desc | STRING | -1 | -1 | -1 | -1 |
+| i_current_price | FLOAT | -1 | -1 | 4 | 4 |
+| i_wholesale_cost | FLOAT | -1 | -1 | 4 | 4 |
+| i_brand_id | INT | -1 | -1 | 4 | 4 |
+| i_brand | STRING | -1 | -1 | -1 | -1 |
+| i_class_id | INT | -1 | -1 | 4 | 4 |
+| i_class | STRING | -1 | -1 | -1 | -1 |
+| i_category_id | INT | -1 | -1 | 4 | 4 |
+| i_manufact_id | INT | -1 | -1 | 4 | 4 |
+| i_manufact | STRING | -1 | -1 | -1 | -1 |
+| i_size | STRING | -1 | -1 | -1 | -1 |
+| i_formulation | STRING | -1 | -1 | -1 | -1 |
+| i_color | STRING | -1 | -1 | -1 | -1 |
+| i_units | STRING | -1 | -1 | -1 | -1 |
+| i_container | STRING | -1 | -1 | -1 | -1 |
+| i_manager_id | INT | -1 | -1 | 4 | 4 |
+| i_product_name | STRING | -1 | -1 | -1 | -1 |
+| i_category | STRING | 10 | 0 | -1 | -1 |
++------------------+-----------+------------------+--------+----------+----------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_compute_stats.xml#compute_stats"/>, <xref href="impala_show.xml#show_table_stats"/>,
+ <xref href="impala_show.xml#show_column_stats"/>, <xref href="impala_perf_stats.xml#perf_stats"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_drop_table.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_drop_table.xml b/docs/topics/impala_drop_table.xml
new file mode 100644
index 0000000..33cb726
--- /dev/null
+++ b/docs/topics/impala_drop_table.xml
@@ -0,0 +1,142 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="drop_table">
+
+ <title>DROP TABLE Statement</title>
+ <titlealts><navtitle>DROP TABLE</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">DROP TABLE statement</indexterm>
+ Removes an Impala table. Also removes the underlying HDFS data files for internal tables, although not for
+ external tables.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>DROP TABLE [IF EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname> <ph rev="2.3.0">[PURGE]</ph></codeblock>
+
+ <p>
+ <b>IF EXISTS clause:</b>
+ </p>
+
+ <p>
+ The optional <codeph>IF EXISTS</codeph> clause makes the statement succeed whether or not the table exists.
+ If the table does exist, it is dropped; if it does not exist, the statement has no effect. This capability is
+ useful in standardized setup scripts that remove existing schema objects and create new ones. By using some
+ combination of <codeph>IF EXISTS</codeph> for the <codeph>DROP</codeph> statements and <codeph>IF NOT
+ EXISTS</codeph> clauses for the <codeph>CREATE</codeph> statements, the script can run successfully the first
+ time you run it (when the objects do not exist yet) and subsequent times (when some or all of the objects do
+ already exist).
+ </p>
+
+ <p rev="2.3.0">
+ <b>PURGE clause:</b>
+ </p>
+
+ <p rev="2.3.0">
+ The optional <codeph>PURGE</codeph> keyword, available in CDH 5.5 / Impala 2.3 and higher,
+ causes Impala to remove the associated HDFS data files
+ immediately, rather than going through the HDFS trashcan mechanism. Use this keyword when dropping
+ a table if it is crucial to remove the data as quickly as possible to free up space, or if there is
+ a problem with the trashcan, such as the trashcan not being configured or being in a different
+ HDFS encryption zone than the data files.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ By default, Impala removes the associated HDFS directory and data files for the table. If you issue a
+ <codeph>DROP TABLE</codeph> and the data files are not deleted, it might be for the following reasons:
+ </p>
+
+ <ul>
+ <li>
+ If the table was created with the
+ <codeph><xref href="impala_tables.xml#external_tables">EXTERNAL</xref></codeph> clause, Impala leaves all
+ files and directories untouched. Use external tables when the data is under the control of other Hadoop
+ components, and Impala is only used to query the data files from their original locations.
+ </li>
+
+ <li>
+ Impala might leave the data files behind unintentionally, if there is no HDFS location available to hold
+ the HDFS trashcan for the <codeph>impala</codeph> user. See
+ <xref href="impala_prereqs.xml#prereqs_account"/> for the procedure to set up the required HDFS home
+ directory.
+ </li>
+ </ul>
+
+ <p>
+ Make sure that you are in the correct database before dropping a table, either by issuing a
+ <codeph>USE</codeph> statement first or by using a fully qualified name
+ <codeph><varname>db_name</varname>.<varname>table_name</varname></codeph>.
+ </p>
+
+ <p>
+ If you intend to issue a <codeph>DROP DATABASE</codeph> statement, first issue <codeph>DROP TABLE</codeph>
+ statements to remove all the tables in that database.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>create database temporary;
+use temporary;
+create table unimportant (x int);
+create table trivial (s string);
+-- Drop a table in the current database.
+drop table unimportant;
+-- Switch to a different database.
+use default;
+-- To drop a table in a different database...
+drop table trivial;
+<i>ERROR: AnalysisException: Table does not exist: default.trivial</i>
+-- ...use a fully qualified name.
+drop table temporary.trivial;</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/disk_space_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/s3_blurb"/>
+ <p rev="2.2.0">
+ Although Impala cannot write new data to a table stored in the Amazon
+ S3 filesystem, the <codeph>DROP TABLE</codeph> statement can remove data files from S3
+ if the associated S3 table is an internal table.
+ See <xref href="impala_s3.xml#s3"/> for details about working with S3 tables.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ For an internal table, the user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have write
+ permission for all the files and directories that make up the table.
+ </p>
+ <p>
+ For an external table, dropping the table only involves changes to metadata in the metastore database.
+ Because Impala does not remove any HDFS files or directories when external tables are dropped,
+ no particular permissions are needed for the associated HDFS files or directories.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_tables.xml#tables"/>,
+ <xref href="impala_alter_table.xml#alter_table"/>, <xref href="impala_create_table.xml#create_table"/>,
+ <xref href="impala_partitioning.xml#partitioning"/>, <xref href="impala_tables.xml#internal_tables"/>,
+ <xref href="impala_tables.xml#external_tables"/>
+ </p>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_drop_view.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_drop_view.xml b/docs/topics/impala_drop_view.xml
new file mode 100644
index 0000000..edcab58
--- /dev/null
+++ b/docs/topics/impala_drop_view.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.1" id="drop_view">
+
+ <title>DROP VIEW Statement</title>
+ <titlealts><navtitle>DROP VIEW</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="Schemas"/>
+ <data name="Category" value="Tables"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">DROP VIEW statement</indexterm>
+ Removes the specified view, which was originally created by the <codeph>CREATE VIEW</codeph> statement.
+ Because a view is purely a logical construct (an alias for a query) with no physical data behind it,
+ <codeph>DROP VIEW</codeph> only involves changes to metadata in the metastore database, not any data files in
+ HDFS.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>DROP VIEW [IF EXISTS] [<varname>db_name</varname>.]<varname>view_name</varname></codeblock>
+
+ <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/create_drop_view_examples"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_views.xml#views"/>, <xref href="impala_create_view.xml#create_view"/>,
+ <xref href="impala_alter_view.xml#alter_view"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_exec_single_node_rows_threshold.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_exec_single_node_rows_threshold.xml b/docs/topics/impala_exec_single_node_rows_threshold.xml
new file mode 100644
index 0000000..fa3007d
--- /dev/null
+++ b/docs/topics/impala_exec_single_node_rows_threshold.xml
@@ -0,0 +1,91 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.0.0" id="exec_single_node_rows_threshold" xml:lang="en-US">
+
+ <title>EXEC_SINGLE_NODE_ROWS_THRESHOLD Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Scalability"/>
+ <data name="Category" value="Performance"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">EXEC_SINGLE_NODE_ROWS_THRESHOLD query option</indexterm>
+ This setting controls the cutoff point (in terms of number of rows scanned) below which Impala treats a query
+ as a <q>small</q> query, turning off optimizations such as parallel execution and native code generation. The
+ overhead for these optimizations is applicable for queries involving substantial amounts of data, but it
+ makes sense to skip them for queries involving tiny amounts of data. Reducing the overhead for small queries
+ allows Impala to complete them more quickly, keeping YARN resources, admission control slots, and so on
+ available for data-intensive queries.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>SET EXEC_SINGLE_NODE_ROWS_THRESHOLD=<varname>number_of_rows</varname></codeblock>
+
+ <p>
+ <b>Type:</b> numeric
+ </p>
+
+ <p>
+ <b>Default:</b> 100
+ </p>
+
+ <p>
+ <b>Usage notes:</b> Typically, you increase the default value to make this optimization apply to more queries.
+ If incorrect or corrupted table and column statistics cause Impala to apply this optimization
+ incorrectly to queries that actually involve substantial work, you might see the queries being slower as a
+ result of remote reads. In that case, recompute statistics with the <codeph>COMPUTE STATS</codeph>
+ or <codeph>COMPUTE INCREMENTAL STATS</codeph> statement. If there is a problem collecting accurate
+ statistics, you can turn this feature off by setting the value to -1.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/internals_blurb"/>
+
+ <p>
+ This setting applies to query fragments where the amount of data to scan can be accurately determined, either
+ through table and column statistics, or by the presence of a <codeph>LIMIT</codeph> clause. If Impala cannot
+ accurately estimate the size of the input data, this setting does not apply.
+ </p>
+
+ <p rev="2.3.0">
+ In CDH 5.5 / Impala 2.3 and higher, where Impala supports the complex data types <codeph>STRUCT</codeph>,
+ <codeph>ARRAY</codeph>, and <codeph>MAP</codeph>, if a query refers to any column of those types,
+ the small-query optimization is turned off for that query regardless of the
+ <codeph>EXEC_SINGLE_NODE_ROWS_THRESHOLD</codeph> setting.
+ </p>
+
+ <p>
+ For a query that is determined to be <q>small</q>, all work is performed on the coordinator node. This might
+ result in some I/O being performed by remote reads. The savings from not distributing the query work and not
+ generating native code are expected to outweigh any overhead from the remote reads.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ A common use case is to query just a few rows from a table to inspect typical data values. In this example,
+ Impala does not parallelize the query or perform native code generation because the result set is guaranteed
+ to be smaller than the threshold value from this query option:
+ </p>
+
+<codeblock>SET EXEC_SINGLE_NODE_ROWS_THRESHOLD=500;
+SELECT * FROM enormous_table LIMIT 300;
+</codeblock>
+
+<!-- Don't have any other places that tie into this particular optimization technique yet.
+Potentially: conceptual topics about code generation, distributed queries
+
+<p conref="/Content/impala_common_xi44078.xml#common/related_info"/>
+<p>
+</p>
+-->
+
+ </conbody>
+
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_explain.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_explain.xml b/docs/topics/impala_explain.xml
new file mode 100644
index 0000000..c9e8846
--- /dev/null
+++ b/docs/topics/impala_explain.xml
@@ -0,0 +1,224 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="explain">
+
+ <title>EXPLAIN Statement</title>
+ <titlealts><navtitle>EXPLAIN</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Reports"/>
+ <data name="Category" value="Planning"/>
+ <data name="Category" value="Performance"/>
+ <data name="Category" value="Troubleshooting"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">EXPLAIN statement</indexterm>
+ Returns the execution plan for a statement, showing the low-level mechanisms that Impala will use to read the
+ data, divide the work among nodes in the cluster, and transmit intermediate and final results across the
+ network. Use <codeph>explain</codeph> followed by a complete <codeph>SELECT</codeph> query. For example:
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>EXPLAIN { <varname>select_query</varname> | <varname>ctas_stmt</varname> | <varname>insert_stmt</varname> }
+</codeblock>
+
+ <p>
+ The <varname>select_query</varname> is a <codeph>SELECT</codeph> statement, optionally prefixed by a
+ <codeph>WITH</codeph> clause. See <xref href="impala_select.xml#select"/> for details.
+ </p>
+
+ <p>
+ The <varname>insert_stmt</varname> is an <codeph>INSERT</codeph> statement that inserts into or overwrites an
+ existing table. It can use either the <codeph>INSERT ... SELECT</codeph> or <codeph>INSERT ...
+ VALUES</codeph> syntax. See <xref href="impala_insert.xml#insert"/> for details.
+ </p>
+
+ <p>
+ The <varname>ctas_stmt</varname> is a <codeph>CREATE TABLE</codeph> statement using the <codeph>AS
+ SELECT</codeph> clause, typically abbreviated as a <q>CTAS</q> operation. See
+ <xref href="impala_create_table.xml#create_table"/> for details.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ You can interpret the output to judge whether the query is performing efficiently, and adjust the query
+ and/or the schema if not. For example, you might change the tests in the <codeph>WHERE</codeph> clause, add
+ hints to make join operations more efficient, introduce subqueries, change the order of tables in a join, add
+ or change partitioning for a table, collect column statistics and/or table statistics in Hive, or any other
+ performance tuning steps.
+ </p>
+
+ <p>
+ The <codeph>EXPLAIN</codeph> output reminds you if table or column statistics are missing from any table
+ involved in the query. These statistics are important for optimizing queries involving large tables or
+ multi-table joins. See <xref href="impala_compute_stats.xml#compute_stats"/> for how to gather statistics,
+ and <xref href="impala_perf_stats.xml#perf_stats"/> for how to use this information for query tuning.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/explain_interpret"/>
+
+ <p>
+ If you come from a traditional database background and are not familiar with data warehousing, keep in mind
+ that Impala is optimized for full table scans across very large tables. The structure and distribution of
+ this data is typically not suitable for the kind of indexing and single-row lookups that are common in OLTP
+ environments. Seeing a query scan entirely through a large table is common and is not necessarily an
+ indication of an inefficient query. Of course, if you can reduce the volume of scanned data by orders of magnitude, for
+ example by using a query that affects only certain partitions within a partitioned table, then you might be
+ able to optimize a query so that it executes in seconds rather than minutes.
+ </p>
+
+ <p>
+ For more information and examples to help you interpret <codeph>EXPLAIN</codeph> output, see
+ <xref href="impala_explain_plan.xml#perf_explain"/>.
+ </p>
+
+ <p rev="1.2">
+ <b>Extended EXPLAIN output:</b>
+ </p>
+
+ <p rev="1.2">
+ For performance tuning of complex queries, and capacity planning (such as using the admission control and
+ resource management features), you can enable more detailed and informative output for the
+ <codeph>EXPLAIN</codeph> statement. In the <cmdname>impala-shell</cmdname> interpreter, issue the command
+ <codeph>SET EXPLAIN_LEVEL=<varname>level</varname></codeph>, where <varname>level</varname> is an integer
+ from 0 to 3 or corresponding mnemonic values <codeph>minimal</codeph>, <codeph>standard</codeph>,
+ <codeph>extended</codeph>, or <codeph>verbose</codeph>.
+ </p>
+
+ <p rev="1.2">
+ When extended <codeph>EXPLAIN</codeph> output is enabled, <codeph>EXPLAIN</codeph> statements print
+ information about estimated memory requirements, minimum number of virtual cores, and so on that you can use
+ to fine-tune the resource management options explained in
+ <xref href="impala_resource_management.xml#rm_options"/>. (The estimated memory requirements are
+ intentionally on the high side, to allow a margin for error, to avoid cancelling a query unnecessarily if you
+ set the <codeph>MEM_LIMIT</codeph> option to the estimated memory figure.)
+ </p>
+
+ <p>
+ See <xref href="impala_explain_level.xml#explain_level"/> for details and examples.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ This example shows how the standard <codeph>EXPLAIN</codeph> output moves from the lowest (physical) level to
+ the higher (logical) levels. The query begins by scanning a certain amount of data; each node performs an
+ aggregation operation (evaluating <codeph>COUNT(*)</codeph>) on some subset of data that is local to that
+ node; the intermediate results are transmitted back to the coordinator node (labelled here as the
+ <codeph>EXCHANGE</codeph> node); lastly, the intermediate results are summed to display the final result.
+ </p>
+
+<codeblock id="explain_plan_simple">[impalad-host:21000] > explain select count(*) from customer_address;
++----------------------------------------------------------+
+| Explain String |
++----------------------------------------------------------+
+| Estimated Per-Host Requirements: Memory=42.00MB VCores=1 |
+| |
+| 03:AGGREGATE [MERGE FINALIZE] |
+| | output: sum(count(*)) |
+| | |
+| 02:EXCHANGE [PARTITION=UNPARTITIONED] |
+| | |
+| 01:AGGREGATE |
+| | output: count(*) |
+| | |
+| 00:SCAN HDFS [default.customer_address] |
+| partitions=1/1 size=5.25MB |
++----------------------------------------------------------+
+</codeblock>
+
+ <p>
+ These examples show how the extended <codeph>EXPLAIN</codeph> output becomes more accurate and informative as
+ statistics are gathered by the <codeph>COMPUTE STATS</codeph> statement. Initially, much of the information
+ about data size and distribution is marked <q>unavailable</q>. Impala can determine the raw data size, but
+ not the number of rows or number of distinct values for each column without additional analysis. The
+ <codeph>COMPUTE STATS</codeph> statement performs this analysis, so a subsequent <codeph>EXPLAIN</codeph>
+ statement has additional information to use in deciding how to optimize the distributed query.
+ </p>
+
+ <draft-comment translate="no">
+Re-run these examples with more substantial tables populated with data.
+</draft-comment>
+
+<codeblock rev="1.2">[localhost:21000] > set explain_level=extended;
+EXPLAIN_LEVEL set to extended
+[localhost:21000] > explain select x from t1;
++----------------------------------------------------------+
+| Explain String |
++----------------------------------------------------------+
+| Estimated Per-Host Requirements: Memory=32.00MB VCores=1 |
+| |
+| 01:EXCHANGE [PARTITION=UNPARTITIONED] |
+| | hosts=1 per-host-mem=unavailable |
+<b>| | tuple-ids=0 row-size=4B cardinality=unavailable |</b>
+| | |
+| 00:SCAN HDFS [default.t1, PARTITION=RANDOM]               |
+| partitions=1/1 size=36B                                   |
+<b>| table stats: unavailable |</b>
+<b>| column stats: unavailable |</b>
+| hosts=1 per-host-mem=32.00MB |
+<b>| tuple-ids=0 row-size=4B cardinality=unavailable |</b>
++----------------------------------------------------------+
+</codeblock>
+
+<codeblock rev="1.2">[localhost:21000] > compute stats t1;
++-----------------------------------------+
+| summary |
++-----------------------------------------+
+| Updated 1 partition(s) and 1 column(s). |
++-----------------------------------------+
+[localhost:21000] > explain select x from t1;
++----------------------------------------------------------+
+| Explain String |
++----------------------------------------------------------+
+| Estimated Per-Host Requirements: Memory=64.00MB VCores=1 |
+| |
+| 01:EXCHANGE [PARTITION=UNPARTITIONED] |
+| | hosts=1 per-host-mem=unavailable |
+| | tuple-ids=0 row-size=4B cardinality=0 |
+| | |
+| 00:SCAN HDFS [default.t1, PARTITION=RANDOM] |
+| partitions=1/1 size=36B |
+<b>| table stats: 0 rows total |</b>
+<b>| column stats: all |</b>
+| hosts=1 per-host-mem=64.00MB |
+<b>| tuple-ids=0 row-size=4B cardinality=0 |</b>
++----------------------------------------------------------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+ <p conref="../shared/impala_common.xml#common/redaction_yes"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ <!-- Doublecheck these details. Does EXPLAIN really need any permissions? -->
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have read
+ and execute permissions for all applicable directories in all source tables
+ for the query that is being explained.
+ (A <codeph>SELECT</codeph> operation could read files from multiple different HDFS directories
+ if the source table is partitioned.)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref href="impala_select.xml#select"/>,
+ <xref href="impala_insert.xml#insert"/>,
+ <xref href="impala_create_table.xml#create_table"/>,
+ <xref href="impala_explain_plan.xml#explain_plan"/>
+ </p>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_explain_level.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_explain_level.xml b/docs/topics/impala_explain_level.xml
new file mode 100644
index 0000000..f54e8a8
--- /dev/null
+++ b/docs/topics/impala_explain_level.xml
@@ -0,0 +1,338 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.2" id="explain_level">
+
+ <title>EXPLAIN_LEVEL Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Troubleshooting"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Performance"/>
+ <data name="Category" value="Reports"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">EXPLAIN_LEVEL query option</indexterm>
+ Controls the amount of detail provided in the output of the <codeph>EXPLAIN</codeph> statement. The basic
+ output can help you identify high-level performance issues such as scanning a higher volume of data or more
+ partitions than you expect. The higher levels of detail show how intermediate results flow between nodes and
+ how different SQL operations such as <codeph>ORDER BY</codeph>, <codeph>GROUP BY</codeph>, joins, and
+ <codeph>WHERE</codeph> clauses are implemented within a distributed query.
+ </p>
+
+ <p>
+ <b>Type:</b> <codeph>STRING</codeph> or <codeph>INT</codeph>
+ </p>
+
+ <p>
+ <b>Default:</b> <codeph>1</codeph>
+ </p>
+
+ <p>
+ <b>Arguments:</b>
+ </p>
+
+ <p>
+ The allowed range of numeric values for this option is 0 to 3:
+ </p>
+
+ <ul>
+ <li>
+ <codeph>0</codeph> or <codeph>MINIMAL</codeph>: A barebones list, one line per operation. Primarily useful
+ for checking the join order in very long queries where the regular <codeph>EXPLAIN</codeph> output is too
+ long to read easily.
+ </li>
+
+ <li>
+ <codeph>1</codeph> or <codeph>STANDARD</codeph>: The default level of detail, showing the logical way that
+ work is split up for the distributed query.
+ </li>
+
+ <li>
+ <codeph>2</codeph> or <codeph>EXTENDED</codeph>: Includes additional detail about how the query planner
+ uses statistics in its decision-making process, to understand how a query could be tuned by gathering
+ statistics, using query hints, adding or removing predicates, and so on.
+ </li>
+
+ <li>
+ <codeph>3</codeph> or <codeph>VERBOSE</codeph>: The maximum level of detail, showing how work is split up
+ within each node into <q>query fragments</q> that are connected in a pipeline. This extra detail is
+ primarily useful for low-level performance testing and tuning within Impala itself, rather than for
+ rewriting the SQL code at the user level.
+ </li>
+ </ul>
+
+ <note>
+ Prior to Impala 1.3, the allowed argument range for <codeph>EXPLAIN_LEVEL</codeph> was 0 to 1: level 0 had
+ the mnemonic <codeph>NORMAL</codeph>, and level 1 was <codeph>VERBOSE</codeph>. In Impala 1.3 and higher,
+ <codeph>NORMAL</codeph> is not a valid mnemonic value, and <codeph>VERBOSE</codeph> still applies to the
+ highest level of detail but now corresponds to level 3. You might need to adjust the values if you have any
+ older <codeph>impala-shell</codeph> script files that set the <codeph>EXPLAIN_LEVEL</codeph> query option.
+ </note>
+
+ <p>
+ Changing the value of this option controls the amount of detail in the output of the <codeph>EXPLAIN</codeph>
+ statement. The extended information from level 2 or 3 is especially useful during performance tuning, when
+ you need to confirm whether the work for the query is distributed the way you expect, particularly for the
+ most resource-intensive operations such as join queries against large tables, queries against tables with
+ large numbers of partitions, and insert operations for Parquet tables. The extended information also helps to
+ check estimated resource usage when you use the admission control or resource management features explained
+ in <xref href="impala_resource_management.xml#resource_management"/>. See
+ <xref href="impala_explain.xml#explain"/> for the syntax of the <codeph>EXPLAIN</codeph> statement, and
+ <xref href="impala_explain_plan.xml#perf_explain"/> for details about how to use the extended information.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ As always, read the <codeph>EXPLAIN</codeph> output from bottom to top. The lowest lines represent the
+ initial work of the query (scanning data files), the lines in the middle represent calculations done on each
+ node and how intermediate results are transmitted from one node to another, and the topmost lines represent
+ the final results being sent back to the coordinator node.
+ </p>
+
+ <p>
+ The numbers in the left column are generated internally during the initial planning phase and do not
+ represent the actual order of operations, so it is not significant if they appear out of order in the
+ <codeph>EXPLAIN</codeph> output.
+ </p>
+
+ <p>
+ At all <codeph>EXPLAIN</codeph> levels, the plan contains a warning if any tables in the query are missing
+ statistics. Use the <codeph>COMPUTE STATS</codeph> statement to gather statistics for each table and suppress
+ this warning. See <xref href="impala_perf_stats.xml#perf_stats"/> for details about how the statistics help
+ query performance.
+ </p>
+
+ <p>
+ The <codeph>PROFILE</codeph> command in <cmdname>impala-shell</cmdname> always starts with an explain plan
+ showing full detail, the same as with <codeph>EXPLAIN_LEVEL=3</codeph>. <ph rev="1.4.0">After the explain
+ plan comes the executive summary, the same output as produced by the <codeph>SUMMARY</codeph> command in
+ <cmdname>impala-shell</cmdname>.</ph>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ These examples use a trivial, empty table to illustrate how the essential aspects of query planning are shown
+ in <codeph>EXPLAIN</codeph> output:
+ </p>
+
+<codeblock>[localhost:21000] > create table t1 (x int, s string);
+[localhost:21000] > set explain_level=1;
+[localhost:21000] > explain select count(*) from t1;
++------------------------------------------------------------------------------------+
+| Explain String |
++------------------------------------------------------------------------------------+
+| Estimated Per-Host Requirements: Memory=10.00MB VCores=1 |
+| WARNING: The following tables are missing relevant table and/or column statistics. |
+| explain_plan.t1 |
+| |
+| 03:AGGREGATE [MERGE FINALIZE] |
+| | output: sum(count(*)) |
+| | |
+| 02:EXCHANGE [PARTITION=UNPARTITIONED] |
+| | |
+| 01:AGGREGATE |
+| | output: count(*) |
+| | |
+| 00:SCAN HDFS [explain_plan.t1] |
+| partitions=1/1 size=0B |
++------------------------------------------------------------------------------------+
+[localhost:21000] > explain select * from t1;
++------------------------------------------------------------------------------------+
+| Explain String |
++------------------------------------------------------------------------------------+
+| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 |
+| WARNING: The following tables are missing relevant table and/or column statistics. |
+| explain_plan.t1 |
+| |
+| 01:EXCHANGE [PARTITION=UNPARTITIONED] |
+| | |
+| 00:SCAN HDFS [explain_plan.t1] |
+| partitions=1/1 size=0B |
++------------------------------------------------------------------------------------+
+[localhost:21000] > set explain_level=2;
+[localhost:21000] > explain select * from t1;
++------------------------------------------------------------------------------------+
+| Explain String |
++------------------------------------------------------------------------------------+
+| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 |
+| WARNING: The following tables are missing relevant table and/or column statistics. |
+| explain_plan.t1 |
+| |
+| 01:EXCHANGE [PARTITION=UNPARTITIONED] |
+| | hosts=0 per-host-mem=unavailable |
+| | tuple-ids=0 row-size=19B cardinality=unavailable |
+| | |
+| 00:SCAN HDFS [explain_plan.t1, PARTITION=RANDOM] |
+| partitions=1/1 size=0B |
+| table stats: unavailable |
+| column stats: unavailable |
+| hosts=0 per-host-mem=0B |
+| tuple-ids=0 row-size=19B cardinality=unavailable |
++------------------------------------------------------------------------------------+
+[localhost:21000] > set explain_level=3;
+[localhost:21000] > explain select * from t1;
++------------------------------------------------------------------------------------+
+| Explain String |
++------------------------------------------------------------------------------------+
+| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 |
+<b>| WARNING: The following tables are missing relevant table and/or column statistics. |</b>
+<b>| explain_plan.t1 |</b>
+| |
+| F01:PLAN FRAGMENT [PARTITION=UNPARTITIONED] |
+| 01:EXCHANGE [PARTITION=UNPARTITIONED] |
+| hosts=0 per-host-mem=unavailable |
+| tuple-ids=0 row-size=19B cardinality=unavailable |
+| |
+| F00:PLAN FRAGMENT [PARTITION=RANDOM] |
+| DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, PARTITION=UNPARTITIONED] |
+| 00:SCAN HDFS [explain_plan.t1, PARTITION=RANDOM] |
+| partitions=1/1 size=0B |
+<b>| table stats: unavailable |</b>
+<b>| column stats: unavailable |</b>
+| hosts=0 per-host-mem=0B |
+| tuple-ids=0 row-size=19B cardinality=unavailable |
++------------------------------------------------------------------------------------+
+</codeblock>
+
+ <p>
+ As the warning message demonstrates, most of the information needed for Impala to do efficient query
+ planning, and for you to understand the performance characteristics of the query, requires running the
+ <codeph>COMPUTE STATS</codeph> statement for the table:
+ </p>
+
+<codeblock>[localhost:21000] > compute stats t1;
++-----------------------------------------+
+| summary |
++-----------------------------------------+
+| Updated 1 partition(s) and 2 column(s). |
++-----------------------------------------+
+[localhost:21000] > explain select * from t1;
++------------------------------------------------------------------------+
+| Explain String |
++------------------------------------------------------------------------+
+| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 |
+| |
+| F01:PLAN FRAGMENT [PARTITION=UNPARTITIONED] |
+| 01:EXCHANGE [PARTITION=UNPARTITIONED] |
+| hosts=0 per-host-mem=unavailable |
+| tuple-ids=0 row-size=20B cardinality=0 |
+| |
+| F00:PLAN FRAGMENT [PARTITION=RANDOM] |
+| DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, PARTITION=UNPARTITIONED] |
+| 00:SCAN HDFS [explain_plan.t1, PARTITION=RANDOM] |
+| partitions=1/1 size=0B |
+<b>| table stats: 0 rows total |</b>
+<b>| column stats: all |</b>
+| hosts=0 per-host-mem=0B |
+| tuple-ids=0 row-size=20B cardinality=0 |
++------------------------------------------------------------------------+
+</codeblock>
+
+ <p>
+ Joins and other complicated, multi-part queries are the ones where you most commonly need to examine the
+ <codeph>EXPLAIN</codeph> output and customize the amount of detail in the output. This example shows the
+ default <codeph>EXPLAIN</codeph> output for a three-way join query, then the equivalent output with a
+ <codeph>[SHUFFLE]</codeph> hint to change the join mechanism between the first two tables from a broadcast
+ join to a shuffle join.
+ </p>
+
+<codeblock>[localhost:21000] > set explain_level=1;
+[localhost:21000] > explain select one.*, two.*, three.* from t1 one, t1 two, t1 three where one.x = two.x and two.x = three.x;
++------------------------------------------------------------------------------------+
+| Explain String |
++------------------------------------------------------------------------------------+
+| Estimated Per-Host Requirements: Memory=4.00GB VCores=3 |
+| |
+| 07:EXCHANGE [PARTITION=UNPARTITIONED] |
+| | |
+<b>| 04:HASH JOIN [INNER JOIN, BROADCAST] |</b>
+| | hash predicates: two.x = three.x |
+| | |
+<b>| |--06:EXCHANGE [BROADCAST] |</b>
+| | | |
+| | 02:SCAN HDFS [explain_plan.t1 three] |
+| | partitions=1/1 size=0B |
+| | |
+<b>| 03:HASH JOIN [INNER JOIN, BROADCAST] |</b>
+| | hash predicates: one.x = two.x |
+| | |
+<b>| |--05:EXCHANGE [BROADCAST] |</b>
+| | | |
+| | 01:SCAN HDFS [explain_plan.t1 two] |
+| | partitions=1/1 size=0B |
+| | |
+| 00:SCAN HDFS [explain_plan.t1 one] |
+| partitions=1/1 size=0B |
++------------------------------------------------------------------------------------+
+[localhost:21000] > explain select one.*, two.*, three.* from t1 one join [shuffle] t1 two join t1 three where one.x = two.x and two.x = three.x;
++------------------------------------------------------------------------------------+
+| Explain String |
++------------------------------------------------------------------------------------+
+| Estimated Per-Host Requirements: Memory=4.00GB VCores=3 |
+| |
+| 08:EXCHANGE [PARTITION=UNPARTITIONED] |
+| | |
+<b>| 04:HASH JOIN [INNER JOIN, BROADCAST] |</b>
+| | hash predicates: two.x = three.x |
+| | |
+<b>| |--07:EXCHANGE [BROADCAST] |</b>
+| | | |
+| | 02:SCAN HDFS [explain_plan.t1 three] |
+| | partitions=1/1 size=0B |
+| | |
+<b>| 03:HASH JOIN [INNER JOIN, PARTITIONED] |</b>
+| | hash predicates: one.x = two.x |
+| | |
+<b>| |--06:EXCHANGE [PARTITION=HASH(two.x)] |</b>
+| | | |
+| | 01:SCAN HDFS [explain_plan.t1 two] |
+| | partitions=1/1 size=0B |
+| | |
+<b>| 05:EXCHANGE [PARTITION=HASH(one.x)] |</b>
+| | |
+| 00:SCAN HDFS [explain_plan.t1 one] |
+| partitions=1/1 size=0B |
++------------------------------------------------------------------------------------+
+</codeblock>
+
+ <p>
+ For a join involving many different tables, the default <codeph>EXPLAIN</codeph> output might stretch over
+ several pages, and the only details you care about might be the join order and the mechanism (broadcast or
+ shuffle) for joining each pair of tables. In that case, you might set <codeph>EXPLAIN_LEVEL</codeph> to its
+ lowest value of 0, to focus on just the join order and join mechanism for each stage. The following example
+ shows how the rows from the first and second joined tables are hashed and divided among the nodes of the
+ cluster for further filtering; then the entire contents of the third table are broadcast to all nodes for the
+ final stage of join processing.
+ </p>
+
+<codeblock>[localhost:21000] > set explain_level=0;
+[localhost:21000] > explain select one.*, two.*, three.* from t1 one join [shuffle] t1 two join t1 three where one.x = two.x and two.x = three.x;
++---------------------------------------------------------+
+| Explain String |
++---------------------------------------------------------+
+| Estimated Per-Host Requirements: Memory=4.00GB VCores=3 |
+| |
+| 08:EXCHANGE [PARTITION=UNPARTITIONED] |
+<b>| 04:HASH JOIN [INNER JOIN, BROADCAST] |</b>
+<b>| |--07:EXCHANGE [BROADCAST] |</b>
+| | 02:SCAN HDFS [explain_plan.t1 three] |
+<b>| 03:HASH JOIN [INNER JOIN, PARTITIONED] |</b>
+<b>| |--06:EXCHANGE [PARTITION=HASH(two.x)] |</b>
+| | 01:SCAN HDFS [explain_plan.t1 two] |
+<b>| 05:EXCHANGE [PARTITION=HASH(one.x)] |</b>
+| 00:SCAN HDFS [explain_plan.t1 one] |
++---------------------------------------------------------+
+</codeblock>
+
+<!-- Consider adding a related info section to collect the xrefs earlier on this page. -->
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_float.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_float.xml b/docs/topics/impala_float.xml
new file mode 100644
index 0000000..51e3311
--- /dev/null
+++ b/docs/topics/impala_float.xml
@@ -0,0 +1,94 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="float">
+
+ <title>FLOAT Data Type</title>
+ <titlealts><navtitle>FLOAT</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ A single precision floating-point data type used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER
+ TABLE</codeph> statements.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ In the column definition of a <codeph>CREATE TABLE</codeph> statement:
+ </p>
+
+<codeblock><varname>column_name</varname> FLOAT</codeblock>
+
+ <p>
+ <b>Range:</b> 1.40129846432481707e-45 .. 3.40282346638528860e+38, positive or negative
+ </p>
+
+ <p>
+ <b>Precision:</b> 6 to 9 significant digits, depending on usage. The number of significant digits does
+ not depend on the position of the decimal point.
+ </p>
+
+ <p>
+ <b>Representation:</b> The values are stored in 4 bytes, using
+ <xref href="https://en.wikipedia.org/wiki/Single-precision_floating-point_format" scope="external" format="html">IEEE 754 Single Precision Binary Floating Point</xref> format.
+ </p>
+
+ <p>
+ <b>Conversions:</b> Impala automatically converts <codeph>FLOAT</codeph> to more precise
+ <codeph>DOUBLE</codeph> values, but not the other way around. You can use <codeph>CAST()</codeph> to convert
+ <codeph>FLOAT</codeph> values to <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>, <codeph>INT</codeph>,
+ <codeph>BIGINT</codeph>, <codeph>STRING</codeph>, <codeph>TIMESTAMP</codeph>, or <codeph>BOOLEAN</codeph>.
+ You can use exponential notation in <codeph>FLOAT</codeph> literals or when casting from
+ <codeph>STRING</codeph>, for example <codeph>1.0e6</codeph> to represent one million.
+ <ph conref="../shared/impala_common.xml#common/cast_int_to_timestamp"/>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>CREATE TABLE t1 (x FLOAT);
+SELECT CAST(1000.5 AS FLOAT);
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/partitioning_imprecise"/>
+
+ <p conref="../shared/impala_common.xml#common/hbase_ok"/>
+
+ <p conref="../shared/impala_common.xml#common/parquet_ok"/>
+
+ <p conref="../shared/impala_common.xml#common/text_bulky"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/internals_4_bytes"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/added_in_20"/> -->
+
+ <p conref="../shared/impala_common.xml#common/column_stats_constant"/>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+<!-- This conref appears under SUM(), AVG(), FLOAT, and DOUBLE topics. -->
+
+ <p conref="../shared/impala_common.xml#common/sum_double"/>
+
+ <p conref="../shared/impala_common.xml#common/float_double_decimal_caveat"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_literals.xml#numeric_literals"/>, <xref href="impala_math_functions.xml#math_functions"/>,
+ <xref href="impala_double.xml#double"/>
+ </p>
+ </conbody>
+</concept>
[06/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_porting.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_porting.xml b/docs/topics/impala_porting.xml
new file mode 100644
index 0000000..c9c8e52
--- /dev/null
+++ b/docs/topics/impala_porting.xml
@@ -0,0 +1,622 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="porting">
+
+ <title>Porting SQL from Other Database Systems to Impala</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Databases"/>
+ <data name="Category" value="Hive"/>
+ <data name="Category" value="Oracle"/>
+ <data name="Category" value="MySQL"/>
+ <data name="Category" value="PostgreSQL"/>
+ <data name="Category" value="Troubleshooting"/>
+ <data name="Category" value="Porting"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">porting</indexterm>
+ Although Impala uses standard SQL for queries, you might need to modify SQL source when bringing applications
+ to Impala, due to variations in data types, built-in functions, vendor language extensions, and
+ Hadoop-specific syntax. Even when SQL is working correctly, you might make further minor modifications for
+ best performance.
+ </p>
+
+ <p outputclass="toc inpage"/>
+ </conbody>
+
+ <concept id="porting_ddl_dml">
+
+ <title>Porting DDL and DML Statements</title>
+
+ <conbody>
+
+ <p>
+ When adapting SQL code from a traditional database system to Impala, expect to find a number of differences
+ in the DDL statements that you use to set up the schema. Clauses related to physical layout of files,
+ tablespaces, and indexes have no equivalent in Impala. You might restructure your schema considerably to
+ account for the Impala partitioning scheme and Hadoop file formats.
+ </p>
+
+ <p>
+ Expect SQL queries to have a much higher degree of compatibility. With modest rewriting to address vendor
+ extensions and features not yet supported in Impala, you might be able to run identical or almost-identical
+ query text on both systems.
+ </p>
+
+ <p>
+ Therefore, consider separating out the DDL into a separate Impala-specific setup script. Focus your reuse
+ and ongoing tuning efforts on the code for SQL queries.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="porting_data_types">
+
+ <title>Porting Data Types from Other Database Systems</title>
+
+ <conbody>
+
+ <ul>
+ <li>
+ <p>
+ Change any <codeph>VARCHAR</codeph>, <codeph>VARCHAR2</codeph>, and <codeph>CHAR</codeph> columns to
+ <codeph>STRING</codeph>. Remove any length constraints from the column declarations; for example,
+ change <codeph>VARCHAR(32)</codeph> or <codeph>CHAR(1)</codeph> to <codeph>STRING</codeph>. Impala is
+ very flexible about the length of string values; it does not impose any length constraints
+ or do any special processing (such as blank-padding) for <codeph>STRING</codeph> columns.
+ (In Impala 2.0 and higher, there are data types <codeph>VARCHAR</codeph> and <codeph>CHAR</codeph>,
+ with length constraints for both types and blank-padding for <codeph>CHAR</codeph>.
+ However, for performance reasons, it is still preferable to use <codeph>STRING</codeph>
+ columns where practical.)
+ </p>
+ </li>
+
+ <li>
+ <p>
+ For national language character types such as <codeph>NCHAR</codeph>, <codeph>NVARCHAR</codeph>, or
+ <codeph>NCLOB</codeph>, be aware that while Impala can store and query UTF-8 character data, currently
+ some string manipulation operations only work correctly with ASCII data. See
+ <xref href="impala_string.xml#string"/> for details.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Change any <codeph>DATE</codeph>, <codeph>DATETIME</codeph>, or <codeph>TIME</codeph> columns to
+ <codeph>TIMESTAMP</codeph>. Remove any precision constraints. Remove any timezone clauses, and make
+ sure your application logic or ETL process accounts for the fact that Impala expects all
+ <codeph>TIMESTAMP</codeph> values to be in
+ <xref href="http://en.wikipedia.org/wiki/Coordinated_Universal_Time" scope="external" format="html">Coordinated
+ Universal Time (UTC)</xref>. See <xref href="impala_timestamp.xml#timestamp"/> for information about
+ the <codeph>TIMESTAMP</codeph> data type, and
+ <xref href="impala_datetime_functions.xml#datetime_functions"/> for conversion functions for different
+ date and time formats.
+ </p>
+ <p>
+ You might also need to adapt date- and time-related literal values and format strings to use the
+ supported Impala date and time formats. If you have date and time literals with different separators or
+ different numbers of <codeph>YY</codeph>, <codeph>MM</codeph>, and so on placeholders than Impala
+ expects, consider using calls to <codeph>regexp_replace()</codeph> to transform those values to the
+ Impala-compatible format. See <xref href="impala_timestamp.xml#timestamp"/> for information about the
+ allowed formats for date and time literals, and
+ <xref href="impala_string_functions.xml#string_functions"/> for string conversion functions such as
+ <codeph>regexp_replace()</codeph>.
+ </p>
+ <p>
+ Instead of <codeph>SYSDATE</codeph>, call the function <codeph>NOW()</codeph>.
+ </p>
+ <p>
+ Instead of adding or subtracting directly from a date value to produce a value <varname>N</varname>
+ days in the past or future, use an <codeph>INTERVAL</codeph> expression, for example <codeph>NOW() +
+ INTERVAL 30 DAYS</codeph>.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Although Impala supports <codeph>INTERVAL</codeph> expressions for datetime arithmetic, as shown in
+ <xref href="impala_timestamp.xml#timestamp"/>, <codeph>INTERVAL</codeph> is not available as a column
+ data type in Impala. For any <codeph>INTERVAL</codeph> values stored in tables, convert them to numeric
+ values that you can add or subtract using the functions in
+ <xref href="impala_datetime_functions.xml#datetime_functions"/>. For example, if you had a table
+ <codeph>DEADLINES</codeph> with an <codeph>INT</codeph> column <codeph>TIME_PERIOD</codeph>, you could
+ construct dates N days in the future like so:
+ </p>
+<codeblock>SELECT NOW() + INTERVAL time_period DAYS from deadlines;</codeblock>
+ </li>
+
+ <li>
+ <p>
+ For <codeph>YEAR</codeph> columns, change to the smallest Impala integer type that has sufficient
+ range. See <xref href="impala_datatypes.xml#datatypes"/> for details about ranges, casting, and so on
+ for the various numeric data types.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Change any <codeph>DECIMAL</codeph> and <codeph>NUMBER</codeph> types. If fixed-point precision is not
+ required, you can use <codeph>FLOAT</codeph> or <codeph>DOUBLE</codeph> on the Impala side depending on
+ the range of values. For applications that require precise decimal values, such as financial data, you
+ might need to make more extensive changes to table structure and application logic, such as using
+ separate integer columns for dollars and cents, or encoding numbers as string values and writing UDFs
+ to manipulate them. See <xref href="impala_datatypes.xml#datatypes"/> for details about ranges,
+ casting, and so on for the various numeric data types.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ <codeph>FLOAT</codeph>, <codeph>DOUBLE</codeph>, and <codeph>REAL</codeph> types are supported in
+ Impala. Remove any precision and scale specifications. (In Impala, <codeph>REAL</codeph> is just an
+ alias for <codeph>DOUBLE</codeph>; columns declared as <codeph>REAL</codeph> are turned into
+ <codeph>DOUBLE</codeph> behind the scenes.) See <xref href="impala_datatypes.xml#datatypes"/> for
+ details about ranges, casting, and so on for the various numeric data types.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Most integer types from other systems have equivalents in Impala, perhaps under different names such as
+ <codeph>BIGINT</codeph> instead of <codeph>INT8</codeph>. For any that are unavailable, for example
+ <codeph>MEDIUMINT</codeph>, switch to the smallest Impala integer type that has sufficient range.
+ Remove any precision specifications. See <xref href="impala_datatypes.xml#datatypes"/> for details
+ about ranges, casting, and so on for the various numeric data types.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Remove any <codeph>UNSIGNED</codeph> constraints. All Impala numeric types are signed. See
+ <xref href="impala_datatypes.xml#datatypes"/> for details about ranges, casting, and so on for the
+ various numeric data types.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ For any types holding bitwise values, use an integer type with enough range to hold all the relevant
+ bits within a positive integer. See <xref href="impala_datatypes.xml#datatypes"/> for details about
+ ranges, casting, and so on for the various numeric data types.
+ </p>
+ <p>
+ For example, <codeph>TINYINT</codeph> has a maximum positive value of 127, not 256, so to manipulate
+ 8-bit bitfields as positive numbers switch to the next largest type <codeph>SMALLINT</codeph>.
+ </p>
+<codeblock>[localhost:21000] > select cast(127*2 as tinyint);
++--------------------------+
+| cast(127 * 2 as tinyint) |
++--------------------------+
+| -2 |
++--------------------------+
+[localhost:21000] > select cast(128 as tinyint);
++----------------------+
+| cast(128 as tinyint) |
++----------------------+
+| -128 |
++----------------------+
+[localhost:21000] > select cast(127*2 as smallint);
++---------------------------+
+| cast(127 * 2 as smallint) |
++---------------------------+
+| 254 |
++---------------------------+</codeblock>
+ <p>
+ Impala does not support notation such as <codeph>b'0101'</codeph> for bit literals.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ For BLOB values, use <codeph>STRING</codeph> to represent <codeph>CLOB</codeph> or
+ <codeph>TEXT</codeph> types (character based large objects) up to 32 KB in size. Binary large objects
+            such as <codeph>BLOB</codeph>, <codeph>RAW</codeph>, <codeph>BINARY</codeph>, and
+ <codeph>VARBINARY</codeph> do not currently have an equivalent in Impala.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ For Boolean-like types such as <codeph>BOOL</codeph>, use the Impala <codeph>BOOLEAN</codeph> type.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Because Impala currently does not support composite or nested types, any spatial data types in other
+ database systems do not have direct equivalents in Impala. You could represent spatial values in string
+ format and write UDFs to process them. See <xref href="impala_udf.xml#udfs"/> for details. Where
+ practical, separate spatial types into separate tables so that Impala can still work with the
+ non-spatial data.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Take out any <codeph>DEFAULT</codeph> clauses. Impala can use data files produced from many different
+ sources, such as Pig, Hive, or MapReduce jobs. The fast import mechanisms of <codeph>LOAD DATA</codeph>
+ and external tables mean that Impala is flexible about the format of data files, and Impala does not
+ necessarily validate or cleanse data before querying it. When copying data through Impala
+ <codeph>INSERT</codeph> statements, you can use conditional functions such as <codeph>CASE</codeph> or
+ <codeph>NVL</codeph> to substitute some other value for <codeph>NULL</codeph> fields; see
+ <xref href="impala_conditional_functions.xml#conditional_functions"/> for details.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Take out any constraints from your <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph>
+ statements, for example <codeph>PRIMARY KEY</codeph>, <codeph>FOREIGN KEY</codeph>,
+ <codeph>UNIQUE</codeph>, <codeph>NOT NULL</codeph>, <codeph>UNSIGNED</codeph>, or
+ <codeph>CHECK</codeph> constraints. Impala can use data files produced from many different sources,
+ such as Pig, Hive, or MapReduce jobs. Therefore, Impala expects initial data validation to happen
+ earlier during the ETL or ELT cycle. After data is loaded into Impala tables, you can perform queries
+ to test for <codeph>NULL</codeph> values. When copying data through Impala <codeph>INSERT</codeph>
+ statements, you can use conditional functions such as <codeph>CASE</codeph> or <codeph>NVL</codeph> to
+ substitute some other value for <codeph>NULL</codeph> fields; see
+ <xref href="impala_conditional_functions.xml#conditional_functions"/> for details.
+ </p>
+ <p>
+ Do as much verification as practical before loading data into Impala. After data is loaded into Impala,
+ you can do further verification using SQL queries to check if values have expected ranges, if values
+ are <codeph>NULL</codeph> or not, and so on. If there is a problem with the data, you will need to
+ re-run earlier stages of the ETL process, or do an <codeph>INSERT ... SELECT</codeph> statement in
+ Impala to copy the faulty data to a new table and transform or filter out the bad values.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Take out any <codeph>CREATE INDEX</codeph>, <codeph>DROP INDEX</codeph>, and <codeph>ALTER
+ INDEX</codeph> statements, and equivalent <codeph>ALTER TABLE</codeph> statements. Remove any
+ <codeph>INDEX</codeph>, <codeph>KEY</codeph>, or <codeph>PRIMARY KEY</codeph> clauses from
+ <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph> statements. Impala is optimized for bulk
+ read operations for data warehouse-style queries, and therefore does not support indexes for its
+ tables.
+ </p>
+ </li>
+
+ <li>
+ <p>
+            Calls to built-in functions with out-of-range or otherwise incorrect arguments return
+ <codeph>NULL</codeph> in Impala as opposed to raising exceptions. (This rule applies even when the
+ <codeph>ABORT_ON_ERROR=true</codeph> query option is in effect.) Run small-scale queries using
+ representative data to doublecheck that calls to built-in functions are returning expected values
+ rather than <codeph>NULL</codeph>. For example, unsupported <codeph>CAST</codeph> operations do not
+ raise an error in Impala:
+ </p>
+<codeblock>select cast('foo' as int);
++--------------------+
+| cast('foo' as int) |
++--------------------+
+| NULL |
++--------------------+</codeblock>
+ </li>
+
+ <li>
+ <p>
+ For any other type not supported in Impala, you could represent their values in string format and write
+ UDFs to process them. See <xref href="impala_udf.xml#udfs"/> for details.
+ </p>
+ </li>
+
+ <li>
+ <p>
+            To detect the presence of unsupported or unconvertible data types in data files, do initial testing
+ with the <codeph>ABORT_ON_ERROR=true</codeph> query option in effect. This option causes queries to
+ fail immediately if they encounter disallowed type conversions. See
+ <xref href="impala_abort_on_error.xml#abort_on_error"/> for details. For example:
+ </p>
+<codeblock>set abort_on_error=true;
+select count(*) from (select * from t1);
+-- The above query will fail if the data files for T1 contain any
+-- values that can't be converted to the expected Impala data types.
+-- For example, if T1.C1 is defined as INT but the column contains
+-- floating-point values like 1.1, the query will return an error.</codeblock>
+ </li>
+ </ul>
+ </conbody>
+ </concept>
+
+ <concept id="porting_statements">
+
+ <title>SQL Statements to Remove or Adapt</title>
+
+ <conbody>
+
+ <p>
+ Some SQL statements or clauses that you might be familiar with are not currently supported in Impala:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ Impala has no <codeph>DELETE</codeph> statement. Impala is intended for data warehouse-style operations
+ where you do bulk moves and transforms of large quantities of data. Instead of using
+ <codeph>DELETE</codeph>, use <codeph>INSERT OVERWRITE</codeph> to entirely replace the contents of a
+ table or partition, or use <codeph>INSERT ... SELECT</codeph> to copy a subset of data (everything but
+ the rows you intended to delete) from one table to another. See <xref href="impala_dml.xml#dml"/> for
+ an overview of Impala DML statements.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Impala has no <codeph>UPDATE</codeph> statement. Impala is intended for data warehouse-style operations
+ where you do bulk moves and transforms of large quantities of data. Instead of using
+ <codeph>UPDATE</codeph>, do all necessary transformations early in the ETL process, such as in the job
+ that generates the original data, or when copying from one table to another to convert to a particular
+ file format or partitioning scheme. See <xref href="impala_dml.xml#dml"/> for an overview of Impala DML
+ statements.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Impala has no transactional statements, such as <codeph>COMMIT</codeph> or <codeph>ROLLBACK</codeph>.
+ Impala effectively works like the <codeph>AUTOCOMMIT</codeph> mode in some database systems, where
+ changes take effect as soon as they are made.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If your database, table, column, or other names conflict with Impala reserved words, use different
+ names or quote the names with backticks. See <xref href="impala_reserved_words.xml#reserved_words"/>
+ for the current list of Impala reserved words.
+ </p>
+ <p>
+ Conversely, if you use a keyword that Impala does not recognize, it might be interpreted as a table or
+ column alias. For example, in <codeph>SELECT * FROM t1 NATURAL JOIN t2</codeph>, Impala does not
+ recognize the <codeph>NATURAL</codeph> keyword and interprets it as an alias for the table
+ <codeph>t1</codeph>. If you experience any unexpected behavior with queries, check the list of reserved
+ words to make sure all keywords in join and <codeph>WHERE</codeph> clauses are recognized.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Impala supports subqueries only in the <codeph>FROM</codeph> clause of a query, not within the
+            <codeph>WHERE</codeph> clause. Therefore, you cannot use clauses such as <codeph>WHERE
+ <varname>column</varname> IN (<varname>subquery</varname>)</codeph>. Also, Impala does not allow
+ <codeph>EXISTS</codeph> or <codeph>NOT EXISTS</codeph> clauses (although <codeph>EXISTS</codeph> is a
+ reserved keyword).
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Impala supports <codeph>UNION</codeph> and <codeph>UNION ALL</codeph> set operators, but not
+ <codeph>INTERSECT</codeph>. <ph conref="../shared/impala_common.xml#common/union_all_vs_union"/>
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Within queries, Impala requires query aliases for any subqueries:
+ </p>
+<codeblock>-- Without the alias 'contents_of_t1' at the end, query gives syntax error.
+select count(*) from (select * from t1) contents_of_t1;</codeblock>
+ </li>
+
+ <li>
+ <p>
+ When an alias is declared for an expression in a query, that alias cannot be referenced again within
+ the same query block:
+ </p>
+<codeblock>-- Can't reference AVERAGE twice in the SELECT list where it's defined.
+select avg(x) as average, average+1 from t1 group by x;
+ERROR: AnalysisException: couldn't resolve column reference: 'average'
+
+-- Although it can be referenced again later in the same query.
+select avg(x) as average from t1 group by x having average > 3;</codeblock>
+ <p>
+ For Impala, either repeat the expression again, or abstract the expression into a <codeph>WITH</codeph>
+ clause, creating named columns that can be referenced multiple times anywhere in the base query:
+ </p>
+<codeblock>-- The following 2 query forms are equivalent.
+select avg(x) as average, avg(x)+1 from t1 group by x;
+with avg_t as (select avg(x) average from t1 group by x) select average, average+1 from avg_t;</codeblock>
+<!-- An alternative bunch of queries to use in the example above.
+[localhost:21000] > select x*x as x_squared from t1;
+
+[localhost:21000] > select x*x as x_squared from t1 where x_squared < 100;
+ERROR: AnalysisException: couldn't resolve column reference: 'x_squared'
+[localhost:21000] > select x*x as x_squared, x_squared * pi() as pi_x_squared from t1;
+ERROR: AnalysisException: couldn't resolve column reference: 'x_squared'
+[localhost:21000] > select x*x as x_squared from t1 group by x_squared;
+
+[localhost:21000] > select x*x as x_squared from t1 group by x_squared having x_squared < 100;
+-->
+ </li>
+
+ <li>
+ <p>
+ Impala does not support certain rarely used join types that are less appropriate for high-volume tables
+ used for data warehousing. In some cases, Impala supports join types but requires explicit syntax to
+ ensure you do not do inefficient joins of huge tables by accident. For example, Impala does not support
+ natural joins or anti-joins, and requires the <codeph>CROSS JOIN</codeph> operator for Cartesian
+ products. See <xref href="impala_joins.xml#joins"/> for details on the syntax for Impala join clauses.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ Impala has a limited choice of partitioning types. Partitions are defined based on each distinct
+ combination of values for one or more partition key columns. Impala does not redistribute or check data
+ to create evenly distributed partitions; you must choose partition key columns based on your knowledge
+ of the data volume and distribution. Adapt any tables that use range, list, hash, or key partitioning
+ to use the Impala partition syntax for <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph>
+ statements. Impala partitioning is similar to range partitioning where every range has exactly one
+ value, or key partitioning where the hash function produces a separate bucket for every combination of
+ key values. See <xref href="impala_partitioning.xml#partitioning"/> for usage details, and
+ <xref href="impala_create_table.xml#create_table"/> and
+ <xref href="impala_alter_table.xml#alter_table"/> for syntax.
+ </p>
+ <note>
+ Because the number of separate partitions is potentially higher than in other database systems, keep a
+ close eye on the number of partitions and the volume of data in each one; scale back the number of
+ partition key columns if you end up with too many partitions with a small volume of data in each one.
+ Remember, to distribute work for a query across a cluster, you need at least one HDFS block per node.
+ HDFS blocks are typically multiple megabytes, <ph rev="parquet_block_size">especially</ph> for Parquet
+ files. Therefore, if each partition holds only a few megabytes of data, you are unlikely to see much
+ parallelism in the query because such a small amount of data is typically processed by a single node.
+ </note>
+ </li>
+
+ <li>
+ <p>
+ For <q>top-N</q> queries, Impala uses the <codeph>LIMIT</codeph> clause rather than comparing against a
+ pseudocolumn named <codeph>ROWNUM</codeph> or <codeph>ROW_NUM</codeph>. See
+ <xref href="impala_limit.xml#limit"/> for details.
+ </p>
+ </li>
+ </ul>
+ </conbody>
+ </concept>
+
+ <concept id="porting_antipatterns">
+
+    <title>SQL Constructs to Double-Check</title>
+
+ <conbody>
+
+ <p>
+ Some SQL constructs that are supported have behavior or defaults more oriented towards convenience than
+ optimal performance. Also, sometimes machine-generated SQL, perhaps issued through JDBC or ODBC
+ applications, might have inefficiencies or exceed internal Impala limits. As you port SQL code, be alert
+ and change these things where appropriate:
+ </p>
+
+ <ul>
+ <li>
+ <p>
+ A <codeph>CREATE TABLE</codeph> statement with no <codeph>STORED AS</codeph> clause creates data files
+ in plain text format, which is convenient for data interchange but not a good choice for high-volume
+ data with high-performance queries. See <xref href="impala_file_formats.xml#file_formats"/> for why and
+ how to use specific file formats for compact data and high-performance queries. Especially see
+ <xref href="impala_parquet.xml#parquet"/>, for details about the file format most heavily optimized for
+ large-scale data warehouse queries.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ A <codeph>CREATE TABLE</codeph> statement with no <codeph>PARTITIONED BY</codeph> clause stores all the
+ data files in the same physical location, which can lead to scalability problems when the data volume
+ becomes large.
+ </p>
+ <p>
+ On the other hand, adapting tables that were already partitioned in a different database system could
+ produce an Impala table with a high number of partitions and not enough data in each one, leading to
+ underutilization of Impala's parallel query features.
+ </p>
+ <p>
+ See <xref href="impala_partitioning.xml#partitioning"/> for details about setting up partitioning and
+ tuning the performance of queries on partitioned tables.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ The <codeph>INSERT ... VALUES</codeph> syntax is suitable for setting up toy tables with a few rows for
+ functional testing, but because each such statement creates a separate tiny file in HDFS, it is not a
+ scalable technique for loading megabytes or gigabytes (let alone petabytes) of data. Consider revising
+ your data load process to produce raw data files outside of Impala, then setting up Impala external
+ tables or using the <codeph>LOAD DATA</codeph> statement to use those data files instantly in Impala
+ tables, with no conversion or indexing stage. See <xref href="impala_tables.xml#external_tables"/> and
+ <xref href="impala_load_data.xml#load_data"/> for details about the Impala techniques for working with
+ data files produced outside of Impala; see <xref href="impala_tutorial.xml#tutorial_etl"/> for examples
+ of ETL workflow for Impala.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If your ETL process is not optimized for Hadoop, you might end up with highly fragmented small data
+ files, or a single giant data file that cannot take advantage of distributed parallel queries or
+ partitioning. In this case, use an <codeph>INSERT ... SELECT</codeph> statement to copy the data into a
+ new table and reorganize into a more efficient layout in the same operation. See
+ <xref href="impala_insert.xml#insert"/> for details about the <codeph>INSERT</codeph> statement.
+ </p>
+ <p>
+ You can do <codeph>INSERT ... SELECT</codeph> into a table with a more efficient file format (see
+ <xref href="impala_file_formats.xml#file_formats"/>) or from an unpartitioned table into a partitioned
+ one (see <xref href="impala_partitioning.xml#partitioning"/>).
+ </p>
+ </li>
+
+ <li>
+ <p>
+ The number of expressions allowed in an Impala query might be smaller than for some other database
+ systems, causing failures for very complicated queries (typically produced by automated SQL
+ generators). Where practical, keep the number of expressions in the <codeph>WHERE</codeph> clauses to
+ approximately 2000 or fewer. As a workaround, set the query option
+ <codeph>DISABLE_CODEGEN=true</codeph> if queries fail for this reason. See
+ <xref href="impala_disable_codegen.xml#disable_codegen"/> for details.
+ </p>
+ </li>
+
+ <li>
+ <p>
+ If practical, rewrite <codeph>UNION</codeph> queries to use the <codeph>UNION ALL</codeph> operator
+ instead. <ph conref="../shared/impala_common.xml#common/union_all_vs_union"/>
+ </p>
+ </li>
+ </ul>
+ </conbody>
+ </concept>
+
+ <concept id="porting_next">
+
+ <title>Next Porting Steps after Verifying Syntax and Semantics</title>
+
+ <conbody>
+
+ <p>
+ Throughout this section, some of the decisions you make during the porting process also have a substantial
+      impact on performance. After your SQL code is ported and working correctly, double-check the
+ performance-related aspects of your schema design, physical layout, and queries to make sure that the
+ ported application is taking full advantage of Impala's parallelism, performance-related SQL features, and
+ integration with Hadoop components.
+ </p>
+
+ <ul>
+ <li>
+ Have you run the <codeph>COMPUTE STATS</codeph> statement on each table involved in join queries? Have
+ you also run <codeph>COMPUTE STATS</codeph> for each table used as the source table in an <codeph>INSERT
+ ... SELECT</codeph> or <codeph>CREATE TABLE AS SELECT</codeph> statement?
+ </li>
+
+ <li>
+ Are you using the most efficient file format for your data volumes, table structure, and query
+ characteristics?
+ </li>
+
+ <li>
+ Are you using partitioning effectively? That is, have you partitioned on columns that are often used for
+ filtering in <codeph>WHERE</codeph> clauses? Have you partitioned at the right granularity so that there
+ is enough data in each partition to parallelize the work for each query?
+ </li>
+
+ <li>
+ Does your ETL process produce a relatively small number of multi-megabyte data files (good) rather than a
+ huge number of small files (bad)?
+ </li>
+ </ul>
+
+ <p>
+ See <xref href="impala_performance.xml#performance"/> for details about the whole performance tuning
+ process.
+ </p>
+ </conbody>
+ </concept>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_query_options.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_query_options.xml b/docs/topics/impala_query_options.xml
new file mode 100644
index 0000000..1011746
--- /dev/null
+++ b/docs/topics/impala_query_options.xml
@@ -0,0 +1,75 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="query_options">
+
+ <title>Query Options for the SET Statement</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="impala-shell"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Configuring"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ You can specify the following options using the <codeph>SET</codeph> statement, and those settings affect all
+ queries issued from that session.
+ </p>
+
+ <p>
+ Some query options are useful in day-to-day operations for improving usability, performance, or flexibility.
+ </p>
+
+ <p>
+ Other query options control special-purpose aspects of Impala operation and are intended primarily for
+ advanced debugging or troubleshooting.
+ </p>
+
+ <p>
+ Options with Boolean parameters can be set to 1 or <codeph>true</codeph> to enable, or 0 or <codeph>false</codeph>
+ to turn off.
+ </p>
+
+ <note rev="2.0.0">
+ In Impala 2.0 and later, you can set query options directly through the JDBC and ODBC interfaces by using the
+ <codeph>SET</codeph> statement. Formerly, <codeph>SET</codeph> was only available as a command within the
+ <cmdname>impala-shell</cmdname> interpreter.
+ </note>
+
+<!-- This is the list including defaults from the pre-release 1.2 impala-shell:
+ ABORT_ON_DEFAULT_LIMIT_EXCEEDED: 0
+ ABORT_ON_ERROR: 0
+ ALLOW_UNSUPPORTED_FORMATS: 0
+ BATCH_SIZE: 0
+ DEBUG_ACTION:
+ DEFAULT_ORDER_BY_LIMIT: -1
+ DISABLE_CODEGEN: 0
+ HBASE_CACHE_BLOCKS: 0
+ HBASE_CACHING: 0
+ MAX_ERRORS: 0
+ MAX_IO_BUFFERS: 0
+ MAX_SCAN_RANGE_LENGTH: 0
+ MEM_LIMIT: 0
+ NUM_NODES: 0
+ NUM_SCANNER_THREADS: 0
+ PARQUET_COMPRESSION_CODEC: SNAPPY
+ PARQUET_FILE_SIZE: 0
+ SUPPORT_START_OVER: false
+-->
+
+ <p outputclass="toc"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_set.xml#set"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_query_timeout_s.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_query_timeout_s.xml b/docs/topics/impala_query_timeout_s.xml
new file mode 100644
index 0000000..41f2918
--- /dev/null
+++ b/docs/topics/impala_query_timeout_s.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.0.0" id="query_timeout_s">
+
+ <title>QUERY_TIMEOUT_S Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">QUERY_TIMEOUT_S query option</indexterm>
+ Sets the idle query timeout value for the session, in seconds. Queries that sit idle for longer than the
+ timeout value are automatically cancelled. If the system administrator specified the
+ <codeph>--idle_query_timeout</codeph> startup option, <codeph>QUERY_TIMEOUT_S</codeph> must be smaller than
+ or equal to the <codeph>--idle_query_timeout</codeph> value.
+ </p>
+
+ <note conref="../shared/impala_common.xml#common/timeout_clock_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>SET QUERY_TIMEOUT_S=<varname>seconds</varname>;</codeblock>
+
+<!-- Don't have a compelling example to show at this time because the 'idle' aspect only applies
+ when the client is careless and leaves the query open. Can't easily demonstrate in impala-shell.
+
+ <p conref="/Content/impala_common_xi44078.xml#common/example_blurb"/>
+-->
+
+ <p>
+ <b>Type:</b> numeric
+ </p>
+
+ <p>
+ <b>Default:</b> 0 (no timeout if <codeph>--idle_query_timeout</codeph> not in effect; otherwise, use
+ <codeph>--idle_query_timeout</codeph> value)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_timeouts.xml#timeouts"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_real.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_real.xml b/docs/topics/impala_real.xml
new file mode 100644
index 0000000..e6430e3
--- /dev/null
+++ b/docs/topics/impala_real.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="real">
+
+ <title>REAL Data Type</title>
+ <titlealts><navtitle>REAL</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ An alias for the <codeph>DOUBLE</codeph> data type. See <xref href="impala_double.xml#double"/> for details.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ These examples show how you can use the type names <codeph>REAL</codeph> and <codeph>DOUBLE</codeph>
+      interchangeably, and behind the scenes Impala always treats them as <codeph>DOUBLE</codeph>.
+ </p>
+
+<codeblock>[localhost:21000] > create table r1 (x real);
+[localhost:21000] > describe r1;
++------+--------+---------+
+| name | type | comment |
++------+--------+---------+
+| x | double | |
++------+--------+---------+
+[localhost:21000] > insert into r1 values (1.5), (cast (2.2 as double));
+[localhost:21000] > select cast (1e6 as real);
++---------------------------+
+| cast(1000000.0 as double) |
++---------------------------+
+| 1000000 |
++---------------------------+</codeblock>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_refresh.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_refresh.xml b/docs/topics/impala_refresh.xml
new file mode 100644
index 0000000..ee022d5
--- /dev/null
+++ b/docs/topics/impala_refresh.xml
@@ -0,0 +1,234 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="refresh">
+
+ <title>REFRESH Statement</title>
+ <titlealts><navtitle>REFRESH</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Hive"/>
+ <data name="Category" value="Metastore"/>
+ <data name="Category" value="ETL"/>
+ <data name="Category" value="Ingest"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">REFRESH statement</indexterm>
+ To accurately respond to queries, the Impala node that acts as the coordinator (the node to which you are
+ connected through <cmdname>impala-shell</cmdname>, JDBC, or ODBC) must have current metadata about those
+ databases and tables that are referenced in Impala queries. If you are not familiar with the way Impala uses
+ metadata and how it shares the same metastore database as Hive, see
+ <xref href="impala_hadoop.xml#intro_metastore"/> for background information.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>REFRESH [<varname>db_name</varname>.]<varname>table_name</varname></codeblock>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Use the <codeph>REFRESH</codeph> statement to load the latest metastore metadata and block location data for
+ a particular table in these scenarios:
+ </p>
+
+ <ul>
+ <li>
+ After loading new data files into the HDFS data directory for the table. (Once you have set up an ETL
+ pipeline to bring data into Impala on a regular basis, this is typically the most frequent reason why
+ metadata needs to be refreshed.)
+ </li>
+
+ <li>
+ After issuing <codeph>ALTER TABLE</codeph>, <codeph>INSERT</codeph>, <codeph>LOAD DATA</codeph>, or other
+ table-modifying SQL statement in Hive.
+ </li>
+ </ul>
+
+ <p>
+ You only need to issue the <codeph>REFRESH</codeph> statement on the node to which you connect to issue
+ queries. The coordinator node divides the work among all the Impala nodes in a cluster, and sends read
+ requests for the correct HDFS blocks without relying on the metadata on the other nodes.
+ </p>
+
+ <p>
+ <codeph>REFRESH</codeph> reloads the metadata for the table from the metastore database, and does an
+ incremental reload of the low-level block location data to account for any new data files added to the HDFS
+ data directory for the table. It is a low-overhead, single-table operation, specifically tuned for the common
+ scenario where new data files are added to HDFS.
+ </p>
+
+ <p>
+ Only the metadata for the specified table is flushed. The table must already exist and be known to Impala,
+ either because the <codeph>CREATE TABLE</codeph> statement was run in Impala rather than Hive, or because a
+ previous <codeph>INVALIDATE METADATA</codeph> statement caused Impala to reload its entire metadata catalog.
+ </p>
+
+ <note>
+ <p rev="1.2">
+ In Impala 1.2 and higher, the catalog service broadcasts any changed metadata as a result of Impala
+ <codeph>ALTER TABLE</codeph>, <codeph>INSERT</codeph> and <codeph>LOAD DATA</codeph> statements to all
+ Impala nodes. Thus, the <codeph>REFRESH</codeph> statement is only required if you load data through Hive
+ or by manipulating data files in HDFS directly. See <xref href="impala_components.xml#intro_catalogd"/> for
+ more information on the catalog service.
+ </p>
+ <p rev="1.2.1">
+ In Impala 1.2.1 and higher, another way to avoid inconsistency across nodes is to enable the
+ <codeph>SYNC_DDL</codeph> query option before performing a DDL statement or an <codeph>INSERT</codeph> or
+ <codeph>LOAD DATA</codeph>.
+ </p>
+ <p>
+ The functionality of the <codeph>REFRESH</codeph> statement has changed in Impala 1.1 and higher. Now the
+ table name is a required parameter. To flush the metadata for all tables, use the
+ <codeph><xref href="impala_invalidate_metadata.xml#invalidate_metadata">INVALIDATE METADATA</xref></codeph>
+ command.
+ </p>
+ <draft-comment translate="no"> Almost-identical wording here, under INVALIDATE METADATA, and in Release Notes :: New Features. Makes sense to conref. </draft-comment>
+ <p>
+ Because <codeph>REFRESH <varname>table_name</varname></codeph> only works for tables that Impala is already
+ aware of, when you create a new table in the Hive shell, you must enter <codeph>INVALIDATE
+ METADATA</codeph> with no table parameter before you can see the new table in
+ <cmdname>impala-shell</cmdname>. Once the table is known to Impala, you can issue <codeph>REFRESH
+ <varname>table_name</varname></codeph> as needed after you add more data files for that table.
+ </p>
+ </note>
+
+ <p conref="../shared/impala_common.xml#common/refresh_vs_invalidate"/>
+
+ <p>
+ A metadata update for an <codeph>impalad</codeph> instance <b>is</b> required if:
+ </p>
+
+ <ul>
+ <li>
+ A metadata change occurs.
+ </li>
+
+ <li>
+ <b>and</b> the change is made through Hive.
+ </li>
+
+ <li>
+ <b>and</b> the change is made to a database to which clients such as the Impala shell or ODBC directly
+ connect.
+ </li>
+ </ul>
+
+ <p rev="1.2">
+ A metadata update for an Impala node is <b>not</b> required after you run <codeph>ALTER TABLE</codeph>,
+ <codeph>INSERT</codeph>, or other table-modifying statement in Impala rather than Hive. Impala handles the
+ metadata synchronization automatically through the catalog service.
+ </p>
+
+ <p>
+ Database and table metadata is typically modified by:
+ </p>
+
+ <ul>
+ <li>
+ Hive - through <codeph>ALTER</codeph>, <codeph>CREATE</codeph>, <codeph>DROP</codeph> or
+ <codeph>INSERT</codeph> operations.
+ </li>
+
+ <li>
+ Impalad - through <codeph>CREATE TABLE</codeph>, <codeph>ALTER TABLE</codeph>, and <codeph>INSERT</codeph>
+ operations. <ph rev="1.2">In Impala 1.2 and higher, such changes are propagated to all Impala nodes by the
+ Impala catalog service.</ph>
+ </li>
+ </ul>
+
+ <p>
+ <codeph>REFRESH</codeph> causes the metadata for that table to be immediately reloaded. For a huge table,
+ that process could take a noticeable amount of time; but doing the refresh up front avoids an unpredictable
+ delay later, for example if the next reference to the table is during a benchmark test.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+
+
+ <p>
+ The following example shows how you might use the <codeph>REFRESH</codeph> statement after manually adding
+ new HDFS data files to the Impala data directory for a table:
+ </p>
+
+<codeblock>[impalad-host:21000] > refresh t1;
+[impalad-host:21000] > refresh t2;
+[impalad-host:21000] > select * from t1;
+...
+[impalad-host:21000] > select * from t2;
+... </codeblock>
+
+ <p>
+ For more examples of using <codeph>REFRESH</codeph> and <codeph>INVALIDATE METADATA</codeph> with a
+ combination of Impala and Hive operations, see <xref href="impala_tutorial.xml#tutorial_impala_hive"/>.
+ </p>
+
+ <p>
+ <b>Related impalad options:</b>
+ </p>
+
+ <p>
+ In Impala 1.0, the <codeph>-r</codeph> option of <cmdname>impala-shell</cmdname> issued
+ <codeph>REFRESH</codeph> to reload metadata for all tables.
+ </p>
+
+ <p>
+ In Impala 1.1 and higher, this option issues <codeph>INVALIDATE METADATA</codeph> because
+ <codeph>REFRESH</codeph> now requires a table name parameter. Due to the expense of reloading the metadata
+ for all tables, the <cmdname>impala-shell</cmdname> <codeph>-r</codeph> option is not recommended for
+ day-to-day use in a production environment.
+ </p>
+
+ <p rev="1.2">
+ In Impala 1.2 and higher, the <codeph>-r</codeph> option is needed even less frequently, because metadata
+ changes caused by SQL statements in Impala are automatically broadcast to all Impala nodes.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have execute
+ permissions for all the relevant directories holding table data.
+ (A table could have data spread across multiple directories,
+ or in unexpected paths, if it uses partitioning or
+ specifies a <codeph>LOCATION</codeph> attribute for
+ individual partitions or the entire table.)
+ Issues with permissions might not cause an immediate error for this statement,
+ but subsequent statements such as <codeph>SELECT</codeph>
+ or <codeph>SHOW TABLE STATS</codeph> could fail.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/hdfs_blurb"/>
+
+ <p>
+ The <codeph>REFRESH</codeph> command checks HDFS permissions of the underlying data files and directories,
+ caching this information so that a statement can be cancelled immediately if for example the
+ <codeph>impala</codeph> user does not have permission to write to the data directory for the table. Impala
+ reports any lack of write permissions as an <codeph>INFO</codeph> message in the log file, in case that
+ represents an oversight. If you change HDFS permissions to make data readable or writeable by the Impala
+ user, issue another <codeph>REFRESH</codeph> to make Impala aware of the change.
+ </p>
+
+ <note conref="../shared/impala_common.xml#common/compute_stats_next"/>
+
+ <p conref="../shared/impala_common.xml#common/s3_blurb"/>
+ <p conref="../shared/impala_common.xml#common/s3_metadata"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref href="impala_hadoop.xml#intro_metastore"/>,
+ <xref href="impala_invalidate_metadata.xml#invalidate_metadata"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_request_pool.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_request_pool.xml b/docs/topics/impala_request_pool.xml
new file mode 100644
index 0000000..cf2a811
--- /dev/null
+++ b/docs/topics/impala_request_pool.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.3.0" id="request_pool">
+
+ <title>REQUEST_POOL Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Resource Management"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Admission Control"/>
+ <data name="Category" value="YARN"/>
+ <data name="Category" value="Llama"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">REQUEST_POOL query option</indexterm>
+ The pool or queue name that queries should be submitted to. Only applies when you enable the Impala admission
+ control feature (CDH 4 or CDH 5; see <xref href="impala_admission.xml#admission_control"/>), or the YARN
+ resource management feature (CDH 5 only; see
+ <xref href="impala_resource_management.xml#resource_management"/>). Specifies the name of the pool used by
+ requests from Impala to the resource manager.
+ </p>
+
+ <p>
+ Formerly known as <codeph>YARN_POOL</codeph> during the CDH 5 beta period. Renamed to reflect that it can be
+ used both with YARN and with the lightweight admission control feature introduced in Impala 1.3.
+ </p>
+
+ <p>
+ <b>Type:</b> <codeph>STRING</codeph>
+ </p>
+
+ <p>
+ <b>Default:</b> empty (use the user-to-pool mapping defined by an <cmdname>impalad</cmdname> startup option
+ in the Impala configuration file)
+ </p>
+
+<!-- Worth adding a couple of related info links here. -->
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_reservation_request_timeout.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_reservation_request_timeout.xml b/docs/topics/impala_reservation_request_timeout.xml
new file mode 100644
index 0000000..0316e44
--- /dev/null
+++ b/docs/topics/impala_reservation_request_timeout.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.2" id="reservation_request_timeout">
+
+ <title>RESERVATION_REQUEST_TIMEOUT Query Option (CDH 5 only)</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Resource Management"/>
+ <data name="Category" value="YARN"/>
+ <data name="Category" value="Llama"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">RESERVATION_REQUEST_TIMEOUT query option</indexterm>
+ Maximum number of milliseconds Impala will wait for a reservation to be completely granted or denied. Used in
+ conjunction with the Impala resource management feature in Impala 1.2 and higher with CDH 5.
+ </p>
+
+ <p>
+ <b>Type:</b> numeric
+ </p>
+
+ <p>
+ <b>Default:</b> 300000 (5 minutes)
+ </p>
+
+<!-- Worth adding a couple of related info links here. -->
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_revoke.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_revoke.xml b/docs/topics/impala_revoke.xml
new file mode 100644
index 0000000..88fbbf9
--- /dev/null
+++ b/docs/topics/impala_revoke.xml
@@ -0,0 +1,96 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.0.0" id="revoke">
+
+ <title>REVOKE Statement (CDH 5.2 or higher only)</title>
+ <titlealts><navtitle>REVOKE (CDH 5.2 or higher only)</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="DDL"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Sentry"/>
+ <data name="Category" value="Roles"/>
+ <!-- Consider whether to go deeper into categories like Security for the Sentry-related statements. -->
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">REVOKE statement</indexterm>
+<!-- Copied from Sentry docs. Turn into conref. I did some rewording for clarity. -->
+ The <codeph>REVOKE</codeph> statement revokes roles or privileges on a specified object from groups. Only
+ Sentry administrative users can revoke the role from a group. The revocation has a cascading effect. For
+ example, revoking the <codeph>ALL</codeph> privilege on a database also revokes the same privilege for all
+ the tables in that database.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock rev="2.3.0 collevelauth">REVOKE ROLE <varname>role_name</varname> FROM GROUP <varname>group_name</varname>
+
+REVOKE <varname>privilege</varname> ON <varname>object_type</varname> <varname>object_name</varname>
+ FROM [ROLE] <varname>role_name</varname>
+
+<ph rev="2.3.0">privilege ::= SELECT | SELECT(<varname>column_name</varname>) | INSERT | ALL</ph>
+object_type ::= TABLE | DATABASE | SERVER | URI
+</codeblock>
+
+ <p>
+ Typically, the object name is an identifier. For URIs, it is a string literal.
+ </p>
+
+ <p rev="2.3.0 collevelauth">
+ The ability to grant or revoke <codeph>SELECT</codeph> privilege on specific columns is available
+ in CDH 5.5 / Impala 2.3 and higher. See <xref href="sg_hive_sql.xml#concept_c2q_4qx_p4/col_level_auth_sentry"/>
+ for details.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/privileges_blurb"/>
+
+ <p>
+ Only administrative users (those with <codeph>ALL</codeph> privileges on the server, defined in the Sentry
+ policy file) can use this statement.
+ </p>
+
+<!-- Turn compatibility info into a conref or series of conrefs. (In both GRANT and REVOKE.) -->
+
+ <p conref="../shared/impala_common.xml#common/compatibility_blurb"/>
+
+ <p>
+ <ul>
+ <li>
+ The Impala <codeph>GRANT</codeph> and <codeph>REVOKE</codeph> statements are available in CDH 5.2 and
+ higher.
+ </li>
+
+ <li>
+ In CDH 5.1 and higher, Impala makes use of any roles and privileges specified by the
+ <codeph>GRANT</codeph> and <codeph>REVOKE</codeph> statements in Hive, when your system is configured to
+ use the Sentry service instead of the file-based policy mechanism.
+ </li>
+
+ <li>
+ The Impala <codeph>GRANT</codeph> and <codeph>REVOKE</codeph> statements do not require the
+ <codeph>ROLE</codeph> keyword to be repeated before each role name, unlike the equivalent Hive
+ statements.
+ </li>
+
+ <li conref="../shared/impala_common.xml#common/grant_revoke_single"/>
+ </ul>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+      <xref href="impala_authorization.xml#authorization"/>, <xref href="impala_grant.xml#grant"/>,
+ <xref href="impala_create_role.xml#create_role"/>, <xref href="impala_drop_role.xml#drop_role"/>,
+ <xref href="impala_show.xml#show"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_schema_objects.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_schema_objects.xml b/docs/topics/impala_schema_objects.xml
new file mode 100644
index 0000000..d8abe12
--- /dev/null
+++ b/docs/topics/impala_schema_objects.xml
@@ -0,0 +1,57 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="schema_objects">
+
+ <title>Impala Schema Objects and Object Names</title>
+ <titlealts><navtitle>Schema Objects and Object Names</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">schema objects</indexterm>
+ With Impala, you work with schema objects that are familiar to database users: primarily databases, tables, views,
+ and functions. The SQL syntax to work with these objects is explained in
+ <xref href="impala_langref_sql.xml#langref_sql"/>. This section explains the conceptual knowledge you need to
+ work with these objects and the various ways to specify their names.
+ </p>
+
+ <p>
+ Within a table, partitions can also be considered a kind of object. Partitioning is an important subject for
+ Impala, with its own documentation section covering use cases and performance considerations. See
+ <xref href="impala_partitioning.xml#partitioning"/> for details.
+ </p>
+
+ <p>
+ Impala does not have a counterpart of the <q>tablespace</q> notion from some database systems. By default,
+ all the data files for a database, table, or partition are located within nested folders within the HDFS file
+ system. You can also specify a particular HDFS location for a given Impala table or partition. The raw data
+ for these objects is represented as a collection of data files, providing the flexibility to load data by
+ simply moving files into the expected HDFS location.
+ </p>
+
+ <p>
+ Information about the schema objects is held in the
+ <xref href="impala_hadoop.xml#intro_metastore">metastore</xref> database. This database is shared between
+ Impala and Hive, allowing each to create, drop, and query each other's databases, tables, and so on. When
+ Impala makes a change to schema objects through a <codeph>CREATE</codeph>, <codeph>ALTER</codeph>,
+ <codeph>DROP</codeph>, <codeph>INSERT</codeph>, or <codeph>LOAD DATA</codeph> statement, it broadcasts those
+ changes to all nodes in the cluster through the <xref href="impala_components.xml#intro_catalogd">catalog
+ service</xref>. When you make such changes through Hive or directly through manipulating HDFS files, you use
+ the <xref href="impala_refresh.xml#refresh">REFRESH</xref> or
+ <xref href="impala_invalidate_metadata.xml#invalidate_metadata">INVALIDATE METADATA</xref> statements on the
+ Impala side to recognize the newly loaded data, new tables, and so on.
+ </p>
+
+ <p outputclass="toc"/>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_select.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_select.xml b/docs/topics/impala_select.xml
new file mode 100644
index 0000000..db63f71
--- /dev/null
+++ b/docs/topics/impala_select.xml
@@ -0,0 +1,203 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="select">
+
+ <title>SELECT Statement</title>
+ <titlealts><navtitle>SELECT</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Reports"/>
+ <data name="Category" value="Tables"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <!-- This is such an important statement, think if there are more applicable categories. -->
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">SELECT statement</indexterm>
+ The <codeph>SELECT</codeph> statement performs queries, retrieving data from one or more tables and producing
+ result sets consisting of rows and columns.
+ </p>
+
+ <p>
+ The Impala <codeph><xref href="impala_insert.xml#insert">INSERT</xref></codeph> statement also typically ends
+ with a <codeph>SELECT</codeph> statement, to define data to copy from one table to another.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>[WITH <i>name</i> AS (<i>select_expression</i>) [, ...] ]
+SELECT
+ [ALL | DISTINCT]
+ [STRAIGHT_JOIN]
+ <i>expression</i> [, <i>expression</i> ...]
+FROM <i>table_reference</i> [, <i>table_reference</i> ...]
+[[FULL | [LEFT | RIGHT] INNER | [LEFT | RIGHT] OUTER | [LEFT | RIGHT] SEMI | [LEFT | RIGHT] ANTI | CROSS]
+ JOIN <i>table_reference</i>
+ [ON <i>join_equality_clauses</i> | USING (<varname>col1</varname>[, <varname>col2</varname> ...])]] ...
+WHERE <i>conditions</i>
+GROUP BY { <i>column</i> | <i>expression</i> [, ...] }
+HAVING <i>conditions</i>
+ORDER BY { <i>column</i> | <i>expression</i> [ASC | DESC] [NULLS FIRST | NULLS LAST] [, ...] }
+LIMIT <i>expression</i> [OFFSET <i>expression</i>]
+[UNION [ALL] <i>select_statement</i>] ...
+</codeblock>
+
+ <p>
+ Impala <codeph>SELECT</codeph> queries support:
+ </p>
+
+ <ul>
+ <li>
+ SQL scalar data types: <codeph><xref href="impala_boolean.xml#boolean">BOOLEAN</xref></codeph>,
+ <codeph><xref href="impala_tinyint.xml#tinyint">TINYINT</xref></codeph>,
+ <codeph><xref href="impala_smallint.xml#smallint">SMALLINT</xref></codeph>,
+ <codeph><xref href="impala_int.xml#int">INT</xref></codeph>,
+ <codeph><xref href="impala_bigint.xml#bigint">BIGINT</xref></codeph>,
+ <codeph><xref href="impala_decimal.xml#decimal">DECIMAL</xref></codeph>,
+ <codeph><xref href="impala_float.xml#float">FLOAT</xref></codeph>,
+ <codeph><xref href="impala_double.xml#double">DOUBLE</xref></codeph>,
+ <codeph><xref href="impala_timestamp.xml#timestamp">TIMESTAMP</xref></codeph>,
+ <codeph><xref href="impala_string.xml#string">STRING</xref></codeph>,
+ <codeph><xref href="impala_varchar.xml#varchar">VARCHAR</xref></codeph>,
+ <codeph><xref href="impala_char.xml#char">CHAR</xref></codeph>.
+ </li>
+
+<!-- To do: Consider promoting 'querying complex types' to its own subtopic or pseudo-heading. -->
+ <li rev="2.3.0">
+ The complex data types <codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, and <codeph>MAP</codeph>,
+ are available in CDH 5.5 / Impala 2.3 and higher.
+ Queries involving these types typically involve special qualified names
+ using dot notation for referring to the complex column fields,
+ and join clauses for bringing the complex columns into the result set.
+ See <xref href="impala_complex_types.xml#complex_types"/> for details.
+ </li>
+
+ <li rev="1.1">
+ An optional <xref href="impala_with.xml#with"><codeph>WITH</codeph> clause</xref> before the
+ <codeph>SELECT</codeph> keyword, to define a subquery whose name or column names can be referenced from
+ later in the main query. This clause lets you abstract repeated clauses, such as aggregation functions,
+ that are referenced multiple times in the same query.
+ </li>
+
+ <li>
+ By default, one <codeph>DISTINCT</codeph> clause per query. See <xref href="impala_distinct.xml#distinct"/>
+ for details. See <xref href="impala_appx_count_distinct.xml#appx_count_distinct"/> for a query option to
+ allow multiple <codeph>COUNT(DISTINCT)</codeph> expressions in the same query.
+ </li>
+
+ <li>
+ Subqueries in a <codeph>FROM</codeph> clause. In CDH 5.2 / Impala 2.0 and higher,
+ subqueries can also go in the <codeph>WHERE</codeph> clause, for example with the
+ <codeph>IN()</codeph>, <codeph>EXISTS</codeph>, and <codeph>NOT EXISTS</codeph> operators.
+ </li>
+
+ <li>
+ <codeph>WHERE</codeph>, <codeph>GROUP BY</codeph>, <codeph>HAVING</codeph> clauses.
+ </li>
+
+ <li rev="obwl">
+ <codeph><xref href="impala_order_by.xml#order_by">ORDER BY</xref></codeph>. Prior to Impala 1.4.0, Impala
+ required that queries using an <codeph>ORDER BY</codeph> clause also include a
+ <codeph><xref href="impala_limit.xml#limit">LIMIT</xref></codeph> clause. In Impala 1.4.0 and higher, this
+ restriction is lifted; sort operations that would exceed the Impala memory limit automatically use a
+ temporary disk work area to perform the sort.
+ </li>
+
+ <li>
+ <p conref="../shared/impala_common.xml#common/join_types"/>
+ <p>
+ See <xref href="impala_joins.xml#joins"/> for details and examples of join queries.
+ </p>
+ </li>
+
+ <li>
+ <codeph>UNION ALL</codeph>.
+ </li>
+
+ <li>
+ <codeph>LIMIT</codeph>.
+ </li>
+
+ <li>
+ External tables.
+ </li>
+
+ <li>
+ Relational operators such as greater than, less than, or equal to.
+ </li>
+
+ <li>
+ Arithmetic operators such as addition or subtraction.
+ </li>
+
+ <li>
+ Logical/Boolean operators <codeph>AND</codeph>, <codeph>OR</codeph>, and <codeph>NOT</codeph>. Impala does
+ not support the corresponding symbols <codeph>&amp;&amp;</codeph>, <codeph>||</codeph>, and
+ <codeph>!</codeph>.
+ </li>
+
+ <li>
+ Common SQL built-in functions such as <codeph>COUNT</codeph>, <codeph>SUM</codeph>, <codeph>CAST</codeph>,
+ <codeph>LIKE</codeph>, <codeph>IN</codeph>, <codeph>BETWEEN</codeph>, and <codeph>COALESCE</codeph>. Impala
+ specifically supports built-ins described in <xref href="impala_functions.xml#builtins"/>.
+ </li>
+ </ul>
+
+ <p conref="../shared/impala_common.xml#common/ignore_file_extensions"/>
+
+ <p conref="../shared/impala_common.xml#common/security_blurb"/>
+ <p conref="../shared/impala_common.xml#common/redaction_yes"/>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_yes"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have read
+ permissions for the files in all applicable directories in all source tables,
+ and read and execute permissions for the relevant data directories.
+ (A <codeph>SELECT</codeph> operation could read files from multiple different HDFS directories
+ if the source table is partitioned.)
+ If a query attempts to read a data file and is unable to because of an HDFS permission error,
+ the query halts and does not return any further results.
+ </p>
+
+ <p outputclass="toc"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ The <codeph>SELECT</codeph> syntax is so extensive that it forms its own category of statements: queries. The
+ other major classifications of SQL statements are data definition language (see
+ <xref href="impala_ddl.xml#ddl"/>) and data manipulation language (see <xref href="impala_dml.xml#dml"/>).
+ </p>
+
+ <p>
+ Because the focus of Impala is on fast queries with interactive response times over huge data sets, query
+ performance and scalability are important considerations. See
+ <xref href="impala_performance.xml#performance"/> and <xref href="impala_scalability.xml#scalability"/> for
+ details.
+ </p>
+ </conbody>
+
+ <concept id="where" audience="Cloudera">
+
+<!-- WHERE hidden for the moment until there's the chance to add some reasonably comprehensive content
+
+ and make it its own file. -->
+
+ <title>WHERE Clause</title>
+
+ <conbody>
+
+ <p/>
+ </conbody>
+ </concept>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_set.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_set.xml b/docs/topics/impala_set.xml
new file mode 100644
index 0000000..afa6777
--- /dev/null
+++ b/docs/topics/impala_set.xml
@@ -0,0 +1,90 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.0.0" id="set">
+
+ <title>SET Statement</title>
+ <titlealts><navtitle>SET</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Configuring"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">SET statement</indexterm>
+ Specifies values for query options that control the runtime behavior of other statements within the same
+ session.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>SET [<varname>query_option</varname>=<varname>option_value</varname>]
+</codeblock>
+
+ <p>
+ <codeph>SET</codeph> with no arguments returns a result set consisting of all available query options and
+ their current values.
+ </p>
+
+ <p>
+ The query option name and any string argument values are case-insensitive.
+ </p>
+
+ <p>
+ Each query option has a specific allowed notation for its arguments. Boolean options can be enabled and
+ disabled by assigning values of either <codeph>true</codeph> and <codeph>false</codeph>, or
+ <codeph>1</codeph> and <codeph>0</codeph>. Some numeric options accept a final character signifying the unit,
+ such as <codeph>2g</codeph> for 2 gigabytes or <codeph>100m</codeph> for 100 megabytes. See
+ <xref href="impala_query_options.xml#query_options"/> for the details of each query option.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ <codeph>MEM_LIMIT</codeph> is probably the most commonly used query option. You can specify a high value to
+ allow a resource-intensive query to complete. For testing how queries would work on memory-constrained
+ systems, you might specify an artificially low value.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example sets some numeric and some Boolean query options to control usage of memory, disk
+ space, and timeout periods, then runs a query whose success could depend on the options in effect:
+ </p>
+
+<codeblock>set mem_limit=64g;
+set DISABLE_UNSAFE_SPILLS=true;
+set parquet_file_size=400m;
+set RESERVATION_REQUEST_TIMEOUT=900000;
+insert overwrite parquet_table select c1, c2, count(c3) from text_table group by c1, c2, c3;
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/added_in_20"/>
+
+ <p>
+ <codeph>SET</codeph> has always been available as an <cmdname>impala-shell</cmdname> command. Promoting it to
+ a SQL statement lets you use this feature in client applications through the JDBC and ODBC APIs.
+ </p>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/jdbc_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ See <xref href="impala_query_options.xml#query_options"/> for the query options you can adjust using this
+ statement.
+ </p>
+ </conbody>
+</concept>
[08/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_max.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_max.xml b/docs/topics/impala_max.xml
new file mode 100644
index 0000000..b989785
--- /dev/null
+++ b/docs/topics/impala_max.xml
@@ -0,0 +1,192 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="max">
+
+ <title>MAX Function</title>
+ <titlealts><navtitle>MAX</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="Analytic Functions"/>
+ <data name="Category" value="Aggregate Functions"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">max() function</indexterm>
+ An aggregate function that returns the maximum value from a set of numbers. Opposite of the
+ <codeph>MIN</codeph> function. Its single argument can be a numeric column, or the numeric result of a function
+ or expression applied to the column value. Rows with a <codeph>NULL</codeph> value for the specified column
+ are ignored. If the table is empty, or all the values supplied to <codeph>MAX</codeph> are
+ <codeph>NULL</codeph>, <codeph>MAX</codeph> returns <codeph>NULL</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>MAX([DISTINCT | ALL] <varname>expression</varname>) [OVER (<varname>analytic_clause</varname>)]</codeblock>
+
+ <p>
+ When the query contains a <codeph>GROUP BY</codeph> clause, returns one value for each combination of
+ grouping values.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_sliding_window"/>
+
+ <p conref="../shared/impala_common.xml#common/return_type_same_except_string"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_explanation"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_example"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>-- Find the largest value for this column in the table.
+select max(c1) from t1;
+-- Find the largest value for this column from a subset of the table.
+select max(c1) from t1 where month = 'January' and year = '2013';
+-- Find the largest value from a set of numeric function results.
+select max(length(s)) from t1;
+-- Can also be used in combination with DISTINCT and/or GROUP BY.
+-- Return more than one result.
+select month, year, max(purchase_price) from store_stats group by month, year;
+-- Filter the input to eliminate duplicates before performing the calculation.
+select max(distinct x) from t1;
+</codeblock>
+
+ <p rev="2.0.0">
+ The following examples show how to use <codeph>MAX()</codeph> in an analytic context. They use a table
+ containing integers from 1 to 10. Notice how the <codeph>MAX()</codeph> is reported for each input value, as
+ opposed to the <codeph>GROUP BY</codeph> clause which condenses the result set.
+<codeblock>select x, property, max(x) over (partition by property) as max from int_t where property in ('odd','even');
++----+----------+-----+
+| x | property | max |
++----+----------+-----+
+| 2 | even | 10 |
+| 4 | even | 10 |
+| 6 | even | 10 |
+| 8 | even | 10 |
+| 10 | even | 10 |
+| 1 | odd | 9 |
+| 3 | odd | 9 |
+| 5 | odd | 9 |
+| 7 | odd | 9 |
+| 9 | odd | 9 |
++----+----------+-----+
+</codeblock>
+
+Adding an <codeph>ORDER BY</codeph> clause lets you experiment with results that are cumulative or apply to a moving
+set of rows (the <q>window</q>). The following examples use <codeph>MAX()</codeph> in an analytic context
+(that is, with an <codeph>OVER()</codeph> clause) to display the largest value of <codeph>X</codeph>
+encountered up to each row in the result set. The examples use two columns in the <codeph>ORDER BY</codeph>
+clause to produce a sequence of values that rises and falls, to illustrate how the <codeph>MAX()</codeph>
+result only increases or stays the same throughout each partition within the result set.
+The basic <codeph>ORDER BY x</codeph> clause implicitly
+activates a window clause of <codeph>RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>,
+which is effectively the same as <codeph>ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>,
+therefore all of these examples produce the same results:
+
+<codeblock>select x, property,
+ max(x) <b>over (order by property, x desc)</b> as 'maximum to this point'
+from int_t where property in ('prime','square');
++---+----------+-----------------------+
+| x | property | maximum to this point |
++---+----------+-----------------------+
+| 7 | prime | 7 |
+| 5 | prime | 7 |
+| 3 | prime | 7 |
+| 2 | prime | 7 |
+| 9 | square | 9 |
+| 4 | square | 9 |
+| 1 | square | 9 |
++---+----------+-----------------------+
+
+select x, property,
+ max(x) over
+ (
+ <b>order by property, x desc</b>
+ <b>rows between unbounded preceding and current row</b>
+ ) as 'maximum to this point'
+from int_t where property in ('prime','square');
++---+----------+-----------------------+
+| x | property | maximum to this point |
++---+----------+-----------------------+
+| 7 | prime | 7 |
+| 5 | prime | 7 |
+| 3 | prime | 7 |
+| 2 | prime | 7 |
+| 9 | square | 9 |
+| 4 | square | 9 |
+| 1 | square | 9 |
++---+----------+-----------------------+
+
+select x, property,
+ max(x) over
+ (
+ <b>order by property, x desc</b>
+ <b>range between unbounded preceding and current row</b>
+ ) as 'maximum to this point'
+from int_t where property in ('prime','square');
++---+----------+-----------------------+
+| x | property | maximum to this point |
++---+----------+-----------------------+
+| 7 | prime | 7 |
+| 5 | prime | 7 |
+| 3 | prime | 7 |
+| 2 | prime | 7 |
+| 9 | square | 9 |
+| 4 | square | 9 |
+| 1 | square | 9 |
++---+----------+-----------------------+
+</codeblock>
+
+The following examples show how to construct a moving window, with a running maximum taking into account all rows before
+and 1 row after the current row.
+Because of a restriction in the Impala <codeph>RANGE</codeph> syntax, this type of
+moving window is possible with the <codeph>ROWS BETWEEN</codeph> clause but not the <codeph>RANGE BETWEEN</codeph> clause.
+Because of an extra Impala restriction on the <codeph>MAX()</codeph> and <codeph>MIN()</codeph> functions in an
+analytic context, the lower bound must be <codeph>UNBOUNDED PRECEDING</codeph>.
+<codeblock>select x, property,
+ max(x) over
+ (
+ <b>order by property, x</b>
+ <b>rows between unbounded preceding and 1 following</b>
+ ) as 'local maximum'
+from int_t where property in ('prime','square');
++---+----------+---------------+
+| x | property | local maximum |
++---+----------+---------------+
+| 2 | prime | 3 |
+| 3 | prime | 5 |
+| 5 | prime | 7 |
+| 7 | prime | 7 |
+| 1 | square | 7 |
+| 4 | square | 9 |
+| 9 | square | 9 |
++---+----------+---------------+
+
+-- Doesn't work because of syntax restriction on RANGE clause.
+select x, property,
+ max(x) over
+ (
+ <b>order by property, x</b>
+ <b>range between unbounded preceding and 1 following</b>
+ ) as 'local maximum'
+from int_t where property in ('prime','square');
+ERROR: AnalysisException: RANGE is only supported with both the lower and upper bounds UNBOUNDED or one UNBOUNDED and the other CURRENT ROW.
+</codeblock>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_analytic_functions.xml#analytic_functions"/>, <xref href="impala_min.xml#min"/>,
+ <xref href="impala_avg.xml#avg"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_max_errors.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_max_errors.xml b/docs/topics/impala_max_errors.xml
new file mode 100644
index 0000000..86f3618
--- /dev/null
+++ b/docs/topics/impala_max_errors.xml
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="max_errors">
+
+ <title>MAX_ERRORS Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Troubleshooting"/>
+ <data name="Category" value="Logs"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">MAX_ERRORS query option</indexterm>
+ Maximum number of non-fatal errors for any particular query that are recorded in the Impala log file. For
+ example, if a billion-row table had a non-fatal data error in every row, you could diagnose the problem
+ without all billion errors being logged. Unspecified or 0 indicates the built-in default value of 1000.
+ </p>
+
+ <p>
+ This option only controls how many errors are reported. To specify whether Impala continues or halts when it
+ encounters such errors, use the <codeph>ABORT_ON_ERROR</codeph> option.
+ </p>
+
+ <p>
+ <b>Type:</b> numeric
+ </p>
+
+ <p>
+ <b>Default:</b> 0 (meaning 1000 errors)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref href="impala_abort_on_error.xml#abort_on_error"/>,
+ <xref href="impala_logging.xml#logging"/>
+ </p>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_max_io_buffers.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_max_io_buffers.xml b/docs/topics/impala_max_io_buffers.xml
new file mode 100644
index 0000000..b08c57e
--- /dev/null
+++ b/docs/topics/impala_max_io_buffers.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="max_io_buffers">
+
+ <title>MAX_IO_BUFFERS Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Deprecated Features"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Deprecated query option. Currently has no effect.
+ </p>
+
+ <p>
+ <b>Type:</b> numeric
+ </p>
+
+ <p>
+ <b>Default:</b> 0
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_max_scan_range_length.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_max_scan_range_length.xml b/docs/topics/impala_max_scan_range_length.xml
new file mode 100644
index 0000000..a790fc7
--- /dev/null
+++ b/docs/topics/impala_max_scan_range_length.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="max_scan_range_length">
+
+ <title>MAX_SCAN_RANGE_LENGTH Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">MAX_SCAN_RANGE_LENGTH query option</indexterm>
+ Maximum length of the scan range. Interacts with the number of HDFS blocks in the table to determine how many
+ CPU cores across the cluster are involved with the processing for a query. (Each core processes one scan
+ range.)
+ </p>
+
+ <p>
+ Lowering the value can sometimes increase parallelism if you have unused CPU capacity, but a too-small value
+ can limit query performance because each scan range involves extra overhead.
+ </p>
+
+ <p>
+ Only applicable to HDFS tables. Has no effect on Parquet tables. Unspecified or 0 indicates backend default,
+ which is the same as the HDFS block size for each table.
+ </p>
+
+ <p>
+ Although the scan range can be arbitrarily long, Impala internally uses an 8 MB read buffer so that it can
+ query tables with huge block sizes without allocating equivalent blocks of memory.
+ </p>
+
+ <p>
+ <b>Type:</b> numeric
+ </p>
+
+ <p>
+ <b>Default:</b> 0
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_mem_limit.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_mem_limit.xml b/docs/topics/impala_mem_limit.xml
new file mode 100644
index 0000000..fd12953
--- /dev/null
+++ b/docs/topics/impala_mem_limit.xml
@@ -0,0 +1,208 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="mem_limit">
+
+ <title>MEM_LIMIT Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Scalability"/>
+ <data name="Category" value="Memory"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">MEM_LIMIT query option</indexterm>
+ When resource management is not enabled, defines the maximum amount of memory a query can allocate on each node.
+ Therefore, the total memory that can be used by a query is the <codeph>MEM_LIMIT</codeph> times the number of nodes.
+ </p>
+
+ <p rev="CDH-32135">
+ There are two levels of memory limit for Impala.
+ The <codeph>-mem_limit</codeph> startup option sets an overall limit for the <cmdname>impalad</cmdname> process
+ (which handles multiple queries concurrently).
+ That limit is typically expressed in terms of a percentage of the RAM available on the host, such as <codeph>-mem_limit=70%</codeph>.
+ The <codeph>MEM_LIMIT</codeph> query option, which you set through <cmdname>impala-shell</cmdname>
+ or the <codeph>SET</codeph> statement in a JDBC or ODBC application, applies to each individual query.
+ The <codeph>MEM_LIMIT</codeph> query option is usually expressed as a fixed size such as <codeph>10gb</codeph>,
+ and must always be less than the <cmdname>impalad</cmdname> memory limit.
+ </p>
+
+ <p rev="CDH-32135">
+ If query processing exceeds the specified memory limit on any node, either the per-query limit or the
+ <cmdname>impalad</cmdname> limit, Impala cancels the query automatically.
+ Memory limits are checked periodically during query processing, so the actual memory in use
+ might briefly exceed the limit without the query being cancelled.
+ </p>
+
+ <p>
+ When resource management is enabled in CDH 5, the mechanism for this option changes. If set, it overrides the
+ automatic memory estimate from Impala. Impala requests this amount of memory from YARN on each node, and the
+ query does not proceed until that much memory is available. The actual memory used by the query could be
+ lower, since some queries use much less memory than others. With resource management, the
+ <codeph>MEM_LIMIT</codeph> setting acts both as a hard limit on the amount of memory a query can use on any
+ node (enforced by YARN) and a guarantee that that much memory will be available on each node while the query
+ is being executed. When resource management is enabled but no <codeph>MEM_LIMIT</codeph> setting is
+ specified, Impala estimates the amount of memory needed on each node for each query, requests that much
+ memory from YARN before starting the query, and then internally sets the <codeph>MEM_LIMIT</codeph> on each
+ node to the requested amount of memory during the query. Thus, if the query takes more memory than was
+ originally estimated, Impala detects that the <codeph>MEM_LIMIT</codeph> is exceeded and cancels the query
+ itself.
+ </p>
+
+ <p>
+ <b>Type:</b> numeric
+ </p>
+
+ <p rev="CDH-32135">
+ <b>Units:</b> A numeric argument represents memory size in bytes; you can also use a suffix of <codeph>m</codeph> or <codeph>mb</codeph>
+ for megabytes, or more commonly <codeph>g</codeph> or <codeph>gb</codeph> for gigabytes. If you specify a value in an unrecognized
+ format, subsequent queries fail with an error.
+ </p>
+
+ <p rev="CDH-32135">
+ <b>Default:</b> 0 (unlimited)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p rev="CDH-32135">
+ The <codeph>MEM_LIMIT</codeph> setting is primarily useful in a high-concurrency setting,
+ or on a cluster with a workload shared between Impala and other data processing components.
+ You can prevent any query from accidentally using much more memory than expected,
+ which could negatively impact other Impala queries.
+ </p>
+
+ <p rev="CDH-32135">
+ Use the output of the <codeph>SUMMARY</codeph> command in <cmdname>impala-shell</cmdname>
+ to get a report of memory used for each phase of your most heavyweight queries on each node,
+ and then set a <codeph>MEM_LIMIT</codeph> somewhat higher than that.
+ See <xref href="impala_explain_plan.xml#perf_summary"/> for usage information about
+ the <codeph>SUMMARY</codeph> command.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb" rev="CDH-32135"/>
+
+ <p rev="CDH-32135">
+ The following examples show how to set the <codeph>MEM_LIMIT</codeph> query option
+ using a fixed number of bytes, or suffixes representing gigabytes or megabytes.
+ </p>
+
+<codeblock rev="CDH-32135">
+[localhost:21000] > set mem_limit=3000000000;
+MEM_LIMIT set to 3000000000
+[localhost:21000] > select 5;
+Query: select 5
++---+
+| 5 |
++---+
+| 5 |
++---+
+
+[localhost:21000] > set mem_limit=3g;
+MEM_LIMIT set to 3g
+[localhost:21000] > select 5;
+Query: select 5
++---+
+| 5 |
++---+
+| 5 |
++---+
+
+[localhost:21000] > set mem_limit=3gb;
+MEM_LIMIT set to 3gb
+[localhost:21000] > select 5;
++---+
+| 5 |
++---+
+| 5 |
++---+
+
+[localhost:21000] > set mem_limit=3m;
+MEM_LIMIT set to 3m
+[localhost:21000] > select 5;
++---+
+| 5 |
++---+
+| 5 |
++---+
+[localhost:21000] > set mem_limit=3mb;
+MEM_LIMIT set to 3mb
+[nightly55-2.vpc.cloudera.com:21000] > select 5;
++---+
+| 5 |
++---+
+</codeblock>
+
+ <p rev="CDH-32135">
+ The following examples show how unrecognized <codeph>MEM_LIMIT</codeph>
+ values lead to errors for subsequent queries.
+ </p>
+
+<codeblock rev="CDH-32135">
+[localhost:21000] > set mem_limit=3tb;
+MEM_LIMIT set to 3tb
+[localhost:21000] > select 5;
+ERROR: Failed to parse query memory limit from '3tb'.
+
+[localhost:21000] > set mem_limit=xyz;
+MEM_LIMIT set to xyz
+[localhost:21000] > select 5;
+Query: select 5
+ERROR: Failed to parse query memory limit from 'xyz'.
+</codeblock>
+
+ <p rev="CDH-32135">
+ The following example shows the automatic query cancellation
+ when the <codeph>MEM_LIMIT</codeph> value is exceeded
+ on any host involved in the Impala query. First it runs a
+ successful query and checks the largest amount of memory
+ used on any node for any stage of the query.
+ Then it sets an artificially low <codeph>MEM_LIMIT</codeph>
+ setting so that the same query cannot run.
+ </p>
+
+<codeblock rev="CDH-32135">
+[localhost:21000] > select count(*) from customer;
+Query: select count(*) from customer
++----------+
+| count(*) |
++----------+
+| 150000 |
++----------+
+
+[localhost:21000] > select count(distinct c_name) from customer;
+Query: select count(distinct c_name) from customer
++------------------------+
+| count(distinct c_name) |
++------------------------+
+| 150000 |
++------------------------+
+
+[localhost:21000] > summary;
++--------------+--------+----------+----------+---------+------------+----------+---------------+---------------+
+| Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail |
++--------------+--------+----------+----------+---------+------------+----------+---------------+---------------+
+| 06:AGGREGATE | 1 | 230.00ms | 230.00ms | 1 | 1 | 16.00 KB | -1 B | FINALIZE |
+| 05:EXCHANGE | 1 | 43.44us | 43.44us | 1 | 1 | 0 B | -1 B | UNPARTITIONED |
+| 02:AGGREGATE | 1 | 227.14ms | 227.14ms | 1 | 1 | 12.00 KB | 10.00 MB | |
+| 04:AGGREGATE | 1 | 126.27ms | 126.27ms | 150.00K | 150.00K | 15.17 MB | 10.00 MB | |
+| 03:EXCHANGE | 1 | 44.07ms | 44.07ms | 150.00K | 150.00K | 0 B | 0 B | HASH(c_name) |
+<b>| 01:AGGREGATE | 1 | 361.94ms | 361.94ms | 150.00K | 150.00K | 23.04 MB | 10.00 MB | |</b>
+| 00:SCAN HDFS | 1 | 43.64ms | 43.64ms | 150.00K | 150.00K | 24.19 MB | 64.00 MB | tpch.customer |
++--------------+--------+----------+----------+---------+------------+----------+---------------+---------------+
+
+[localhost:21000] > set mem_limit=15mb;
+MEM_LIMIT set to 15mb
+[localhost:21000] > select count(distinct c_name) from customer;
+Query: select count(distinct c_name) from customer
+ERROR:
+Memory limit exceeded
+Query did not have enough memory to get the minimum required buffers in the block manager.
+</codeblock>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_min.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_min.xml b/docs/topics/impala_min.xml
new file mode 100644
index 0000000..a63fc4c
--- /dev/null
+++ b/docs/topics/impala_min.xml
@@ -0,0 +1,191 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="min">
+
+ <title>MIN Function</title>
+ <titlealts><navtitle>MIN</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="Analytic Functions"/>
+ <data name="Category" value="Aggregate Functions"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">min() function</indexterm>
+ An aggregate function that returns the minimum value from a set of numbers. Opposite of the
+ <codeph>MAX</codeph> function. Its single argument can be a numeric column, or the numeric result of a function
+ or expression applied to the column value. Rows with a <codeph>NULL</codeph> value for the specified column
+ are ignored. If the table is empty, or all the values supplied to <codeph>MIN</codeph> are
+ <codeph>NULL</codeph>, <codeph>MIN</codeph> returns <codeph>NULL</codeph>.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>MIN([DISTINCT | ALL] <varname>expression</varname>) [OVER (<varname>analytic_clause</varname>)]</codeblock>
+
+ <p>
+ When the query contains a <codeph>GROUP BY</codeph> clause, returns one value for each combination of
+ grouping values.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_sliding_window"/>
+
+ <p conref="../shared/impala_common.xml#common/return_type_same_except_string"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_explanation"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_example"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>-- Find the smallest value for this column in the table.
+select min(c1) from t1;
+-- Find the smallest value for this column from a subset of the table.
+select min(c1) from t1 where month = 'January' and year = '2013';
+-- Find the smallest value from a set of numeric function results.
+select min(length(s)) from t1;
+-- Can also be used in combination with DISTINCT and/or GROUP BY.
+-- Return more than one result.
+select month, year, min(purchase_price) from store_stats group by month, year;
+-- Filter the input to eliminate duplicates before performing the calculation.
+select min(distinct x) from t1;
+</codeblock>
+
+ <p rev="2.0.0">
+ The following examples show how to use <codeph>MIN()</codeph> in an analytic context. They use a table
+ containing integers from 1 to 10. Notice how the <codeph>MIN()</codeph> is reported for each input value, as
+ opposed to the <codeph>GROUP BY</codeph> clause which condenses the result set.
+<codeblock>select x, property, min(x) over (partition by property) as min from int_t where property in ('odd','even');
++----+----------+-----+
+| x | property | min |
++----+----------+-----+
+| 2 | even | 2 |
+| 4 | even | 2 |
+| 6 | even | 2 |
+| 8 | even | 2 |
+| 10 | even | 2 |
+| 1 | odd | 1 |
+| 3 | odd | 1 |
+| 5 | odd | 1 |
+| 7 | odd | 1 |
+| 9 | odd | 1 |
++----+----------+-----+
+</codeblock>
+
+Adding an <codeph>ORDER BY</codeph> clause lets you experiment with results that are cumulative or apply to a moving
+set of rows (the <q>window</q>). The following examples use <codeph>MIN()</codeph> in an analytic context
+(that is, with an <codeph>OVER()</codeph> clause) to display the smallest value of <codeph>X</codeph>
+encountered up to each row in the result set. The examples use two columns in the <codeph>ORDER BY</codeph>
+clause to produce a sequence of values that rises and falls, to illustrate how the <codeph>MIN()</codeph>
+result only decreases or stays the same throughout each partition within the result set.
+The basic <codeph>ORDER BY x</codeph> clause implicitly
+activates a window clause of <codeph>RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>,
+which is effectively the same as <codeph>ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>,
+therefore all of these examples produce the same results:
+
+<codeblock>select x, property, min(x) <b>over (order by property, x desc)</b> as 'minimum to this point'
+ from int_t where property in ('prime','square');
++---+----------+-----------------------+
+| x | property | minimum to this point |
++---+----------+-----------------------+
+| 7 | prime | 7 |
+| 5 | prime | 5 |
+| 3 | prime | 3 |
+| 2 | prime | 2 |
+| 9 | square | 2 |
+| 4 | square | 2 |
+| 1 | square | 1 |
++---+----------+-----------------------+
+
+select x, property,
+ min(x) over
+ (
+ <b>order by property, x desc</b>
+ <b>range between unbounded preceding and current row</b>
+ ) as 'minimum to this point'
+from int_t where property in ('prime','square');
++---+----------+-----------------------+
+| x | property | minimum to this point |
++---+----------+-----------------------+
+| 7 | prime | 7 |
+| 5 | prime | 5 |
+| 3 | prime | 3 |
+| 2 | prime | 2 |
+| 9 | square | 2 |
+| 4 | square | 2 |
+| 1 | square | 1 |
++---+----------+-----------------------+
+
+select x, property,
+ min(x) over
+ (
+ <b>order by property, x desc</b>
+ <b>rows between unbounded preceding and current row</b>
+ ) as 'minimum to this point'
+from int_t where property in ('prime','square');
++---+----------+-----------------------+
+| x | property | minimum to this point |
++---+----------+-----------------------+
+| 7 | prime | 7 |
+| 5 | prime | 5 |
+| 3 | prime | 3 |
+| 2 | prime | 2 |
+| 9 | square | 2 |
+| 4 | square | 2 |
+| 1 | square | 1 |
++---+----------+-----------------------+
+</codeblock>
+
+The following examples show how to construct a moving window, with a running minimum taking into account all rows before
+and 1 row after the current row.
+Because of a restriction in the Impala <codeph>RANGE</codeph> syntax, this type of
+moving window is possible with the <codeph>ROWS BETWEEN</codeph> clause but not the <codeph>RANGE BETWEEN</codeph> clause.
+Because of an extra Impala restriction on the <codeph>MAX()</codeph> and <codeph>MIN()</codeph> functions in an
+analytic context, the lower bound must be <codeph>UNBOUNDED PRECEDING</codeph>.
+<codeblock>select x, property,
+ min(x) over
+ (
+ <b>order by property, x desc</b>
+ <b>rows between unbounded preceding and 1 following</b>
+ ) as 'local minimum'
+from int_t where property in ('prime','square');
++---+----------+---------------+
+| x | property | local minimum |
++---+----------+---------------+
+| 7 | prime | 5 |
+| 5 | prime | 3 |
+| 3 | prime | 2 |
+| 2 | prime | 2 |
+| 9 | square | 2 |
+| 4 | square | 1 |
+| 1 | square | 1 |
++---+----------+---------------+
+
+-- Doesn't work because of syntax restriction on RANGE clause.
+select x, property,
+ min(x) over
+ (
+ <b>order by property, x desc</b>
+ <b>range between unbounded preceding and 1 following</b>
+ ) as 'local minimum'
+from int_t where property in ('prime','square');
+ERROR: AnalysisException: RANGE is only supported with both the lower and upper bounds UNBOUNDED or one UNBOUNDED and the other CURRENT ROW.
+</codeblock>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_analytic_functions.xml#analytic_functions"/>, <xref href="impala_max.xml#max"/>,
+ <xref href="impala_avg.xml#avg"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_misc_functions.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_misc_functions.xml b/docs/topics/impala_misc_functions.xml
new file mode 100644
index 0000000..bb9f062
--- /dev/null
+++ b/docs/topics/impala_misc_functions.xml
@@ -0,0 +1,148 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="misc_functions">
+
+ <title>Impala Miscellaneous Functions</title>
+ <titlealts><navtitle>Miscellaneous Functions</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Impala supports the following utility functions that do not operate on a particular column or data type:
+ </p>
+
+ <dl>
+ <dlentry rev="1.3.0" id="current_database">
+
+ <dt>
+ <codeph>current_database()</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">current_database() function</indexterm>
+ <b>Purpose:</b> Returns the database that the session is currently using, either <codeph>default</codeph>
+ if no database has been selected, or whatever database the session switched to through a
+ <codeph>USE</codeph> statement or the <cmdname>impalad</cmdname><codeph>-d</codeph> option.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="5.4.5" id="effective_user">
+
+ <dt>
+ <codeph>effective_user()</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">effective_user() function</indexterm>
+ <b>Purpose:</b> Typically returns the same value as <codeph>user()</codeph>,
+ except if delegation is enabled, in which case it returns the ID of the delegated user.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ <p>
+ <b>Added in:</b> CDH 5.4.5
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.3.0" id="pid">
+
+ <dt>
+ <codeph>pid()</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">pid() function</indexterm>
+ <b>Purpose:</b> Returns the process ID of the <cmdname>impalad</cmdname> daemon that the session is
+ connected to. You can use it during low-level debugging, to issue Linux commands that trace the process, show its
+ arguments, and so on, for the <cmdname>impalad</cmdname> process.
+ <p>
+ <b>Return type:</b> <codeph>int</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry audience="Cloudera" id="sleep">
+
+ <dt>
+ <codeph>sleep(int ms)</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">sleep() function</indexterm>
+ <b>Purpose:</b> Pauses the query for a specified number of milliseconds. For slowing down queries with
+ small result sets enough to monitor runtime execution, memory usage, or other factors that otherwise
+ would be difficult to capture during the brief interval of query execution. When used in the
+ <codeph>SELECT</codeph> list, it is called once for each row in the result set; adjust the number of
+ milliseconds accordingly. For example, a query <codeph>SELECT *, sleep(5) FROM
+ table_with_1000_rows</codeph> would take at least 5 seconds to complete (5 milliseconds * 1000 rows in
+ result set). To avoid an excessive number of concurrent queries, use this function for troubleshooting on
+ test and development systems, not for production queries.
+ <p>
+ <b>Return type:</b> N/A
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry rev="1.1" id="user">
+
+ <dt>
+ <codeph>user()</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">user() function</indexterm>
+ <b>Purpose:</b> Returns the username of the Linux user who is connected to the <cmdname>impalad</cmdname>
+ daemon. Typically called a single time, in a query without any <codeph>FROM</codeph> clause, to
+ understand how authorization settings apply in a security context; once you know the logged-in user name,
+ you can check which groups that user belongs to, and from the list of groups you can check which roles
+ are available to those groups through the authorization policy file.
+ <p conref="../shared/impala_common.xml#common/user_kerberized"/>
+ <p>
+ When delegation is enabled, consider calling the <codeph>effective_user()</codeph> function instead.
+ </p>
+ <p>
+ <b>Return type:</b> <codeph>string</codeph>
+ </p>
+ </dd>
+
+ </dlentry>
+
+ <dlentry id="version">
+
+ <dt>
+ <codeph>version()</codeph>
+ </dt>
+
+ <dd>
+ <indexterm audience="Cloudera">version() function</indexterm>
+ <b>Purpose:</b> Returns information such as the precise version number and build date for the
+ <codeph>impalad</codeph> daemon that you are currently connected to. Typically used to confirm that you
+ are connected to the expected level of Impala to use a particular feature, or to connect to several nodes
+ and confirm they are all running the same level of <cmdname>impalad</cmdname>.
+ <p>
+ <b>Return type:</b> <codeph>string</codeph> (with one or more embedded newlines)
+ </p>
+ </dd>
+
+ </dlentry>
+ </dl>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_ndv.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_ndv.xml b/docs/topics/impala_ndv.xml
new file mode 100644
index 0000000..a1e5527
--- /dev/null
+++ b/docs/topics/impala_ndv.xml
@@ -0,0 +1,133 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.2.1" id="ndv">
+
+ <title>NDV Function</title>
+ <titlealts><navtitle>NDV</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Impala Functions"/>
+ <data name="Category" value="Aggregate Functions"/>
+ <data name="Category" value="Querying"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">ndv() function</indexterm>
+ An aggregate function that returns an approximate value similar to the result of <codeph>COUNT(DISTINCT
+ <varname>col</varname>)</codeph>, the <q>number of distinct values</q>. It is much faster than the
+ combination of <codeph>COUNT</codeph> and <codeph>DISTINCT</codeph>, and uses a constant amount of memory and
+ thus is less memory-intensive for columns with high cardinality.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>NDV([DISTINCT | ALL] <varname>expression</varname>)</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p rev="1.2.2">
+ This is the mechanism used internally by the <codeph>COMPUTE STATS</codeph> statement for computing the
+ number of distinct values in a column.
+ </p>
+
+ <p>
+ Because this number is an estimate, it might not reflect the precise number of different values in the
+ column, especially if the cardinality is very low or very high. If the estimated number is higher than the
+ number of rows in the table, Impala adjusts the value internally during query planning.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/former_odd_return_type_string"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/restrictions_sliding_window"/> -->
+
+ <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_explanation"/>
+
+ <p conref="../shared/impala_common.xml#common/complex_types_aggregation_example"/>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/analytic_not_allowed_caveat"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example queries a billion-row table to illustrate the relative performance of
+ <codeph>COUNT(DISTINCT)</codeph> and <codeph>NDV()</codeph>. It shows how <codeph>COUNT(DISTINCT)</codeph>
+ gives a precise answer, but is inefficient for large-scale data where an approximate result is sufficient.
+ The <codeph>NDV()</codeph> function gives an approximate result but is much faster.
+ </p>
+
+<codeblock>select count(distinct col1) from sample_data;
++---------------------+
+| count(distinct col1)|
++---------------------+
+| 100000 |
++---------------------+
+Fetched 1 row(s) in 20.13s
+
+select cast(ndv(col1) as bigint) as col1 from sample_data;
++----------+
+| col1 |
++----------+
+| 139017 |
++----------+
+Fetched 1 row(s) in 8.91s
+</codeblock>
+
+ <p>
+ The following example shows how you can code multiple <codeph>NDV()</codeph> calls in a single query, to
+ easily learn which columns have substantially more or fewer distinct values. This technique is faster than
+ running a sequence of queries with <codeph>COUNT(DISTINCT)</codeph> calls.
+ </p>
+
+<codeblock>select cast(ndv(col1) as bigint) as col1, cast(ndv(col2) as bigint) as col2,
+ cast(ndv(col3) as bigint) as col3, cast(ndv(col4) as bigint) as col4
+ from sample_data;
++----------+-----------+------------+-----------+
+| col1 | col2 | col3 | col4 |
++----------+-----------+------------+-----------+
+| 139017 | 282 | 46 | 145636240 |
++----------+-----------+------------+-----------+
+Fetched 1 row(s) in 34.97s
+
+select count(distinct col1) from sample_data;
++---------------------+
+| count(distinct col1)|
++---------------------+
+| 100000 |
++---------------------+
+Fetched 1 row(s) in 20.13s
+
+select count(distinct col2) from sample_data;
++----------------------+
+| count(distinct col2) |
++----------------------+
+| 278 |
++----------------------+
+Fetched 1 row(s) in 20.09s
+
+select count(distinct col3) from sample_data;
++-----------------------+
+| count(distinct col3) |
++-----------------------+
+| 46 |
++-----------------------+
+Fetched 1 row(s) in 19.12s
+
+select count(distinct col4) from sample_data;
++----------------------+
+| count(distinct col4) |
++----------------------+
+| 147135880 |
++----------------------+
+Fetched 1 row(s) in 266.95s
+</codeblock>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_num_nodes.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_num_nodes.xml b/docs/topics/impala_num_nodes.xml
new file mode 100644
index 0000000..75ae8e8
--- /dev/null
+++ b/docs/topics/impala_num_nodes.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="num_nodes">
+
+ <title>NUM_NODES Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">NUM_NODES query option</indexterm>
+ Limit the number of nodes that process a query, typically during debugging.
+
+ </p>
+
+ <p>
+ <b>Type:</b> numeric
+ </p>
+
+<p>
+ <b>Allowed values:</b> Only accepts the values 0
+ (meaning all nodes) or 1 (meaning all work is done on the coordinator node).
+</p>
+
+ <p>
+ <b>Default:</b> 0
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ If you are diagnosing a problem that you suspect is due to a timing issue caused by
+ distributed query processing, you can set <codeph>NUM_NODES=1</codeph> to verify
+ if the problem still occurs when all the work is done on a single node.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/num_nodes_tip"/>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_num_scanner_threads.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_num_scanner_threads.xml b/docs/topics/impala_num_scanner_threads.xml
new file mode 100644
index 0000000..27cf883
--- /dev/null
+++ b/docs/topics/impala_num_scanner_threads.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="num_scanner_threads">
+
+ <title>NUM_SCANNER_THREADS Query Option</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">NUM_SCANNER_THREADS query option</indexterm>
+ Maximum number of scanner threads (on each node) used for each query. By default, Impala uses as many cores
+ as are available (one thread per core). You might lower this value if queries are using excessive resources
+ on a busy cluster. Impala imposes a maximum value automatically, so a high value has no practical effect.
+ </p>
+
+ <p>
+ <b>Type:</b> numeric
+ </p>
+
+ <p>
+ <b>Default:</b> 0
+ </p>
+
+ <note conref="../shared/impala_common.xml#common/compute_stats_parquet"/>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_offset.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_offset.xml b/docs/topics/impala_offset.xml
new file mode 100644
index 0000000..c9c073d
--- /dev/null
+++ b/docs/topics/impala_offset.xml
@@ -0,0 +1,64 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.2.1" id="offset">
+
+ <title>OFFSET Clause</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Reports"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ The <codeph>OFFSET</codeph> clause in a <codeph>SELECT</codeph> query causes the result set to start some
+ number of rows after the logical first item. The result set is numbered starting from zero, so <codeph>OFFSET
+ 0</codeph> produces the same result as leaving out the <codeph>OFFSET</codeph> clause. Always use this clause
+ in combination with <codeph>ORDER BY</codeph> (so that it is clear which item should be first, second, and so
+ on) and <codeph>LIMIT</codeph> (so that the result set covers a bounded range, such as items 0-9, 100-199,
+ and so on).
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/limit_and_offset"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example shows how you could run a <q>paging</q> query originally written for a traditional
+ database application. Because typical Impala queries process megabytes or gigabytes of data and read large
+ data files from disk each time, it is inefficient to run a separate query to retrieve each small group of
+ items. Use this technique only for compatibility while porting older applications, then rewrite the
+ application code to use a single query with a large result set, and display pages of results from the cached
+ result set.
+ </p>
+
+<codeblock>[localhost:21000] > create table numbers (x int);
+[localhost:21000] > insert into numbers select x from very_long_sequence;
+Inserted 1000000 rows in 1.34s
+[localhost:21000] > select x from numbers order by x limit 5 offset 0;
++----+
+| x |
++----+
+| 1 |
+| 2 |
+| 3 |
+| 4 |
+| 5 |
++----+
+[localhost:21000] > select x from numbers order by x limit 5 offset 5;
++----+
+| x |
++----+
+| 6 |
+| 7 |
+| 8 |
+| 9 |
+| 10 |
++----+
+</codeblock>
+ </conbody>
+</concept>
[03/22] incubator-impala git commit: First try at porting over the
source files necessary for the Impala SQL Reference.
Posted by jr...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_timestamp.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_timestamp.xml b/docs/topics/impala_timestamp.xml
new file mode 100644
index 0000000..c469b54
--- /dev/null
+++ b/docs/topics/impala_timestamp.xml
@@ -0,0 +1,441 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="timestamp">
+
+ <title>TIMESTAMP Data Type</title>
+ <titlealts><navtitle>TIMESTAMP</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Dates and Times"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ A data type used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph> statements, representing a
+ point in time.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ In the column definition of a <codeph>CREATE TABLE</codeph> statement:
+ </p>
+
+<codeblock><varname>column_name</varname> TIMESTAMP</codeblock>
+
+ <p>
+ <b>Range:</b> Allowed date values range from 1400-01-01 to 9999-12-31; this range is different from the Hive
+ <codeph>TIMESTAMP</codeph> type. Internally, the resolution of the time portion of a
+ <codeph>TIMESTAMP</codeph> value is in nanoseconds.
+ </p>
+
+ <p>
+ <b>INTERVAL expressions:</b>
+ </p>
+
+ <p>
+ You can perform date arithmetic by adding or subtracting a specified number of time units, using the
+ <codeph>INTERVAL</codeph> keyword and the <codeph>+</codeph> and <codeph>-</codeph> operators or
+ <codeph>date_add()</codeph> and <codeph>date_sub()</codeph> functions. You can specify units as
+ <codeph>YEAR[S]</codeph>, <codeph>MONTH[S]</codeph>, <codeph>WEEK[S]</codeph>, <codeph>DAY[S]</codeph>,
+ <codeph>HOUR[S]</codeph>, <codeph>MINUTE[S]</codeph>, <codeph>SECOND[S]</codeph>,
+ <codeph>MILLISECOND[S]</codeph>, <codeph>MICROSECOND[S]</codeph>, and <codeph>NANOSECOND[S]</codeph>. You can
+ only specify one time unit in each interval expression, for example <codeph>INTERVAL 3 DAYS</codeph> or
+ <codeph>INTERVAL 25 HOURS</codeph>, but you can produce any granularity by adding together successive
+ <codeph>INTERVAL</codeph> values, such as <codeph><varname>timestamp_value</varname> + INTERVAL 3 WEEKS -
+ INTERVAL 1 DAY + INTERVAL 10 MICROSECONDS</codeph>.
+ </p>
+
+ <p>
+ For example:
+ </p>
+
+<codeblock>select now() + interval 1 day;
+select date_sub(now(), interval 5 minutes);
+insert into auction_details
+ select auction_id, auction_start_time, auction_start_time + interval 2 days + interval 12 hours
+ from new_auctions;</codeblock>
+
+ <p>
+ <b>Time zones:</b>
+ </p>
+
+ <p>
+ By default, Impala does not store timestamps using the local timezone, to avoid undesired results from
+ unexpected time zone issues. Timestamps are stored and interpreted relative to UTC, both when written to or
+ read from data files, and when converted to or from Unix time values through functions such as
+ <codeph>from_unixtime()</codeph> or <codeph>unix_timestamp()</codeph>. To convert such a
+ <codeph>TIMESTAMP</codeph> value to one that represents the date and time in a specific time zone, convert
+ the original value with the <codeph>from_utc_timestamp()</codeph> function.
+ </p>
+
+ <p>
+ Because Impala does not assume that <codeph>TIMESTAMP</codeph> values are in any particular time zone, you
+ must be conscious of the time zone aspects of data that you query, insert, or convert.
+ </p>
+
+ <p>
+ For consistency with Unix system calls, the <codeph>TIMESTAMP</codeph> returned by the <codeph>now()</codeph>
+ function represents the local time in the system time zone, rather than in UTC. To store values relative to
+ the current time in a portable way, convert any <codeph>now()</codeph> return values using the
+ <codeph>to_utc_timestamp()</codeph> function first. For example, the following example shows that the current
+ time in California (where Cloudera HQ is located) is shortly after 2 PM. If that value was written to a data
+ file, and shipped off to a distant server to be analyzed alongside other data from far-flung locations, the
+ dates and times would not match up precisely because of time zone differences. Therefore, the
+ <codeph>to_utc_timestamp()</codeph> function converts it using a common reference point, the UTC time zone
+ (descended from the old Greenwich Mean Time standard). The <codeph>'PDT'</codeph> argument indicates that the
+ original value is from the Pacific time zone with Daylight Saving Time in effect. When servers in all
+ geographic locations run the same transformation on any local date and time values (with the appropriate time
+ zone argument), the stored data uses a consistent representation. Impala queries can use functions such as
+ <codeph>EXTRACT()</codeph>, <codeph>MIN()</codeph>, <codeph>AVG()</codeph>, and so on to do time-series
+ analysis on those timestamps.
+ </p>
+
+<codeblock>[localhost:21000] > select now();
++-------------------------------+
+| now() |
++-------------------------------+
+| 2015-04-09 14:07:46.580465000 |
++-------------------------------+
+[localhost:21000] > select to_utc_timestamp(now(), 'PDT');
++--------------------------------+
+| to_utc_timestamp(now(), 'pdt') |
++--------------------------------+
+| 2015-04-09 21:08:07.664547000 |
++--------------------------------+
+</codeblock>
+
+ <p>
+ The converse function, <codeph>from_utc_timestamp()</codeph>, lets you take stored <codeph>TIMESTAMP</codeph>
+ data or calculated results and convert back to local date and time for processing on the application side.
+ The following example shows how you might represent some future date (such as the ending date and time of an
+ auction) in UTC, and then convert back to local time when convenient for reporting or other processing. The
+ final query in the example tests whether this arbitrary UTC date and time has passed yet, by converting it
+ back to the local time zone and comparing it against the current date and time.
+ </p>
+
+<codeblock>[localhost:21000] > select to_utc_timestamp(now() + interval 2 weeks, 'PDT');
++---------------------------------------------------+
+| to_utc_timestamp(now() + interval 2 weeks, 'pdt') |
++---------------------------------------------------+
+| 2015-04-23 21:08:34.152923000 |
++---------------------------------------------------+
+[localhost:21000] > select from_utc_timestamp('2015-04-23 21:08:34.152923000','PDT');
++------------------------------------------------------------+
+| from_utc_timestamp('2015-04-23 21:08:34.152923000', 'pdt') |
++------------------------------------------------------------+
+| 2015-04-23 14:08:34.152923000 |
++------------------------------------------------------------+
+[localhost:21000] > select from_utc_timestamp('2015-04-23 21:08:34.152923000','PDT') < now();
++--------------------------------------------------------------------+
+| from_utc_timestamp('2015-04-23 21:08:34.152923000', 'pdt') < now() |
++--------------------------------------------------------------------+
+| false |
++--------------------------------------------------------------------+
+</codeblock>
+
+ <p rev="2.2.0">
+ If you have data files written by Hive, those <codeph>TIMESTAMP</codeph> values represent the local timezone
+ of the host where the data was written, potentially leading to inconsistent results when processed by Impala.
+ To avoid compatibility problems or having to code workarounds, you can specify one or both of these
+ <cmdname>impalad</cmdname> startup flags: <codeph>-use_local_tz_for_unix_timestamp_conversions=true</codeph>
+ <codeph>-convert_legacy_hive_parquet_utc_timestamps=true</codeph>. Although
+ <codeph>-convert_legacy_hive_parquet_utc_timestamps</codeph> is turned off by default to avoid performance overhead, Cloudera recommends
+ turning it on when processing <codeph>TIMESTAMP</codeph> columns in Parquet files written by Hive, to avoid unexpected behavior.
+ </p>
+
+ <p rev="2.2.0">
+ The <codeph>-use_local_tz_for_unix_timestamp_conversions</codeph> setting affects conversions from
+ <codeph>TIMESTAMP</codeph> to <codeph>BIGINT</codeph>, or from <codeph>BIGINT</codeph>
+ to <codeph>TIMESTAMP</codeph>. By default, Impala treats all <codeph>TIMESTAMP</codeph> values as UTC,
+ to simplify analysis of time-series data from different geographic regions. When you enable the
+ <codeph>-use_local_tz_for_unix_timestamp_conversions</codeph> setting, these operations
+ treat the input values as if they are in the local time zone of the host doing the processing.
+ See <xref href="impala_datetime_functions.xml#datetime_functions"/> for the list of functions
+ affected by the <codeph>-use_local_tz_for_unix_timestamp_conversions</codeph> setting.
+ </p>
+
+ <p>
+ The following sequence of examples shows how the interpretation of <codeph>TIMESTAMP</codeph> values in
+ Parquet tables is affected by the <codeph>-convert_legacy_hive_parquet_utc_timestamps</codeph>
+ setting.
+ </p>
+
+ <p>
+ Regardless of the <codeph>-convert_legacy_hive_parquet_utc_timestamps</codeph> setting,
+ <codeph>TIMESTAMP</codeph> columns in text tables can be written and read interchangeably by Impala and Hive:
+ </p>
+
+<codeblock>Impala DDL and queries for text table:
+
+[localhost:21000] > create table t1 (x timestamp);
+[localhost:21000] > insert into t1 values (now()), (now() + interval 1 day);
+[localhost:21000] > select x from t1;
++-------------------------------+
+| x |
++-------------------------------+
+| 2015-04-07 15:43:02.892403000 |
+| 2015-04-08 15:43:02.892403000 |
++-------------------------------+
+[localhost:21000] > select to_utc_timestamp(x, 'PDT') from t1;
++-------------------------------+
+| to_utc_timestamp(x, 'pdt') |
++-------------------------------+
+| 2015-04-07 22:43:02.892403000 |
+| 2015-04-08 22:43:02.892403000 |
++-------------------------------+
+
+Hive query for text table:
+
+hive> select * from t1;
+OK
+2015-04-07 15:43:02.892403
+2015-04-08 15:43:02.892403
+Time taken: 1.245 seconds, Fetched: 2 row(s)
+</codeblock>
+
+ <p>
+ When the table uses Parquet format, Impala expects any time zone adjustment to be applied prior to writing,
+ while <codeph>TIMESTAMP</codeph> values written by Hive are adjusted to be in the UTC time zone. When Hive
+ queries Parquet data files that it wrote, it adjusts the <codeph>TIMESTAMP</codeph> values back to the local
+ time zone, while Impala does no conversion. Hive does no time zone conversion when it queries Impala-written
+ Parquet files.
+ </p>
+
+<codeblock>Impala DDL and queries for Parquet table:
+
+[localhost:21000] > create table p1 stored as parquet as select x from t1;
++-------------------+
+| summary |
++-------------------+
+| Inserted 2 row(s) |
++-------------------+
+[localhost:21000] > select x from p1;
++-------------------------------+
+| x |
++-------------------------------+
+| 2015-04-07 15:43:02.892403000 |
+| 2015-04-08 15:43:02.892403000 |
++-------------------------------+
+
+Hive DDL and queries for Parquet table:
+
+hive> create table h1 (x timestamp) stored as parquet;
+OK
+hive> insert into h1 select * from p1;
+...
+OK
+Time taken: 35.573 seconds
+hive> select x from p1;
+OK
+2015-04-07 15:43:02.892403
+2015-04-08 15:43:02.892403
+Time taken: 0.324 seconds, Fetched: 2 row(s)
+hive> select x from h1;
+OK
+2015-04-07 15:43:02.892403
+2015-04-08 15:43:02.892403
+Time taken: 0.197 seconds, Fetched: 2 row(s)
+</codeblock>
+
+ <p>
+ The discrepancy arises when Impala queries the Hive-created Parquet table. The underlying values in the
+ <codeph>TIMESTAMP</codeph> column are different from the ones written by Impala, even though they were copied
+ from one table to another by an <codeph>INSERT ... SELECT</codeph> statement in Hive. Hive did an implicit
+ conversion from the local time zone to UTC as it wrote the values to Parquet.
+ </p>
+
+<codeblock>Impala query for TIMESTAMP values from Impala-written and Hive-written data:
+
+[localhost:21000] > select * from p1;
++-------------------------------+
+| x |
++-------------------------------+
+| 2015-04-07 15:43:02.892403000 |
+| 2015-04-08 15:43:02.892403000 |
++-------------------------------+
+Fetched 2 row(s) in 0.29s
+[localhost:21000] > select * from h1;
++-------------------------------+
+| x |
++-------------------------------+
+| 2015-04-07 22:43:02.892403000 |
+| 2015-04-08 22:43:02.892403000 |
++-------------------------------+
+Fetched 2 row(s) in 0.41s
+
+Underlying integer values for Impala-written and Hive-written data:
+
+[localhost:21000] > select cast(x as bigint) from p1;
++-------------------+
+| cast(x as bigint) |
++-------------------+
+| 1428421382 |
+| 1428507782 |
++-------------------+
+Fetched 2 row(s) in 0.38s
+[localhost:21000] > select cast(x as bigint) from h1;
++-------------------+
+| cast(x as bigint) |
++-------------------+
+| 1428446582 |
+| 1428532982 |
++-------------------+
+Fetched 2 row(s) in 0.20s
+</codeblock>
+
+ <p>
+ When the <codeph>-convert_legacy_hive_parquet_utc_timestamps</codeph> setting is enabled, Impala recognizes
+ the Parquet data files written by Hive, and applies the same UTC-to-local-timezone conversion logic during
+ the query as Hive uses, making the contents of the Impala-written <codeph>P1</codeph> table and the
+ Hive-written <codeph>H1</codeph> table appear identical, whether represented as <codeph>TIMESTAMP</codeph>
+ values or the underlying <codeph>BIGINT</codeph> integers:
+ </p>
+
+<codeblock>[localhost:21000] > select x from p1;
++-------------------------------+
+| x |
++-------------------------------+
+| 2015-04-07 15:43:02.892403000 |
+| 2015-04-08 15:43:02.892403000 |
++-------------------------------+
+Fetched 2 row(s) in 0.37s
+[localhost:21000] > select x from h1;
++-------------------------------+
+| x |
++-------------------------------+
+| 2015-04-07 15:43:02.892403000 |
+| 2015-04-08 15:43:02.892403000 |
++-------------------------------+
+Fetched 2 row(s) in 0.19s
+[localhost:21000] > select cast(x as bigint) from p1;
++-------------------+
+| cast(x as bigint) |
++-------------------+
+| 1428446582 |
+| 1428532982 |
++-------------------+
+Fetched 2 row(s) in 0.29s
+[localhost:21000] > select cast(x as bigint) from h1;
++-------------------+
+| cast(x as bigint) |
++-------------------+
+| 1428446582 |
+| 1428532982 |
++-------------------+
+Fetched 2 row(s) in 0.22s
+</codeblock>
+
+ <p>
+ <b>Conversions:</b>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/timestamp_conversions"/>
+
+ <p>
+ In Impala 1.3 and higher, the <codeph>FROM_UNIXTIME()</codeph> and <codeph>UNIX_TIMESTAMP()</codeph>
+ functions allow a wider range of format strings, with more flexibility in element order, repetition of letter
+ placeholders, and separator characters. In CDH 5.5 / Impala 2.3 and higher, the <codeph>UNIX_TIMESTAMP()</codeph>
+ function also allows a numeric timezone offset to be specified as part of the input string.
+ See <xref href="impala_datetime_functions.xml#datetime_functions"/> for details.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/y2k38"/>
+
+ <p>
+ <b>Partitioning:</b>
+ </p>
+
+ <p>
+ Although you cannot use a <codeph>TIMESTAMP</codeph> column as a partition key, you can extract the
+ individual years, months, days, hours, and so on and partition based on those columns. Because the partition
+ key column values are represented in HDFS directory names, rather than as fields in the data files
+ themselves, you can also keep the original <codeph>TIMESTAMP</codeph> values if desired, without duplicating
+ data or wasting storage space. See <xref href="impala_partitioning.xml#partition_key_columns"/> for more
+ details on partitioning with date and time values.
+ </p>
+
+<codeblock>[localhost:21000] > create table timeline (event string) partitioned by (happened timestamp);
+ERROR: AnalysisException: Type 'TIMESTAMP' is not supported as partition-column type in column: happened
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>select cast('1966-07-30' as timestamp);
+select cast('1985-09-25 17:45:30.005' as timestamp);
+select cast('08:30:00' as timestamp);
+select hour('1970-01-01 15:30:00'); -- Succeeds, returns 15.
+select hour('1970-01-01 15:30'); -- Returns NULL because seconds field required.
+select hour('1970-01-01 27:30:00'); -- Returns NULL because hour value out of range.
+select dayofweek('2004-06-13'); -- Returns 1, representing Sunday.
+select dayname('2004-06-13'); -- Returns 'Sunday'.
+select date_add('2004-06-13', 365); -- Returns 2005-06-13 with zeros for hh:mm:ss fields.
+select day('2004-06-13'); -- Returns 13.
+select datediff('1989-12-31','1984-09-01'); -- How many days between these 2 dates?
+select now(); -- Returns current date and time in local timezone.
+
+create table dates_and_times (t timestamp);
+insert into dates_and_times values
+ ('1966-07-30'), ('1985-09-25 17:45:30.005'), ('08:30:00'), (now());
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/null_bad_timestamp_cast"/>
+
+ <p conref="../shared/impala_common.xml#common/partitioning_worrisome"/>
+
+ <p conref="../shared/impala_common.xml#common/hbase_ok"/>
+
+ <p conref="../shared/impala_common.xml#common/parquet_ok"/>
+
+ <p conref="../shared/impala_common.xml#common/text_bulky"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/internals_16_bytes"/>
+
+ <p conref="../shared/impala_common.xml#common/added_forever"/>
+
+ <p conref="../shared/impala_common.xml#common/column_stats_constant"/>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p>
+ If you cast a <codeph>STRING</codeph> with an unrecognized format to a <codeph>TIMESTAMP</codeph>, the result
+ is <codeph>NULL</codeph> rather than an error. Make sure to test your data pipeline to be sure any textual
+ date and time values are in a format that Impala <codeph>TIMESTAMP</codeph> can recognize.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/avro_no_timestamp"/>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <ul>
+ <li>
+<!-- The Timestamp Literals topic is pretty brief. Consider adding more examples there. -->
+ <xref href="impala_literals.xml#timestamp_literals"/>.
+ </li>
+
+ <li>
+ To convert to or from different date formats, or perform date arithmetic, use the date and time functions
+ described in <xref href="impala_datetime_functions.xml#datetime_functions"/>. In particular, the
+ <codeph>from_unixtime()</codeph> function requires a case-sensitive format string such as
+ <codeph>"yyyy-MM-dd HH:mm:ss.SSSS"</codeph>, matching one of the allowed variations of a
+ <codeph>TIMESTAMP</codeph> value (date plus time, only date, only time, optional fractional seconds).
+ </li>
+
+ <li>
+ See <xref href="impala_langref_unsupported.xml#langref_hiveql_delta"/> for details about differences in
+ <codeph>TIMESTAMP</codeph> handling between Impala and Hive.
+ </li>
+ </ul>
+
+ </conbody>
+
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_tinyint.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_tinyint.xml b/docs/topics/impala_tinyint.xml
new file mode 100644
index 0000000..2b1b3a8
--- /dev/null
+++ b/docs/topics/impala_tinyint.xml
@@ -0,0 +1,101 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="tinyint">
+
+ <title>TINYINT Data Type</title>
+ <titlealts><navtitle>TINYINT</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Data Types"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Data Analysts"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Schemas"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ A 1-byte integer data type used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph> statements.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+ <p>
+ In the column definition of a <codeph>CREATE TABLE</codeph> statement:
+ </p>
+
+<codeblock><varname>column_name</varname> TINYINT</codeblock>
+
+ <p>
+ <b>Range:</b> -128 .. 127. There is no <codeph>UNSIGNED</codeph> subtype.
+ </p>
+
+ <p>
+ <b>Conversions:</b> Impala automatically converts to a larger integer type (<codeph>SMALLINT</codeph>,
+ <codeph>INT</codeph>, or <codeph>BIGINT</codeph>) or a floating-point type (<codeph>FLOAT</codeph> or
+ <codeph>DOUBLE</codeph>). Use <codeph>CAST()</codeph> to convert to <codeph>STRING</codeph> or
+ <codeph>TIMESTAMP</codeph>.
+ <ph conref="../shared/impala_common.xml#common/cast_int_to_timestamp"/>
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ For a convenient and automated way to check the bounds of the <codeph>TINYINT</codeph> type, call the
+ functions <codeph>MIN_TINYINT()</codeph> and <codeph>MAX_TINYINT()</codeph>.
+ </p>
+
+ <p>
+ If an integer value is too large to be represented as a <codeph>TINYINT</codeph>, use a
+ <codeph>SMALLINT</codeph> instead.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/null_bad_numeric_cast"/>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+<codeblock>CREATE TABLE t1 (x TINYINT);
+SELECT CAST(100 AS TINYINT);
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/parquet_blurb"/>
+
+<!-- Duplicated under TINYINT and SMALLINT. Turn into a conref in both places. -->
+
+ <p rev="1.4.0">
+ Physically, Parquet files represent <codeph>TINYINT</codeph> and <codeph>SMALLINT</codeph> values as 32-bit
+ integers. Although Impala rejects attempts to insert out-of-range values into such columns, if you create a
+ new table with the <codeph>CREATE TABLE ... LIKE PARQUET</codeph> syntax, any <codeph>TINYINT</codeph> or
+ <codeph>SMALLINT</codeph> columns in the original table turn into <codeph>INT</codeph> columns in the new
+ table.
+ </p>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/partitioning_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/hbase_ok"/>
+
+ <p conref="../shared/impala_common.xml#common/text_bulky"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/internals_1_bytes"/>
+
+ <p conref="../shared/impala_common.xml#common/added_forever"/>
+
+ <p conref="../shared/impala_common.xml#common/column_stats_constant"/>
+
+<!-- <p conref="/Content/impala_common_xi44078.xml#common/restrictions_blurb"/> -->
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_literals.xml#numeric_literals"/>, <xref href="impala_tinyint.xml#tinyint"/>,
+ <xref href="impala_smallint.xml#smallint"/>, <xref href="impala_int.xml#int"/>,
+ <xref href="impala_bigint.xml#bigint"/>, <xref href="impala_decimal.xml#decimal"/>,
+ <xref href="impala_math_functions.xml#math_functions"/>
+ </p>
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_truncate_table.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_truncate_table.xml b/docs/topics/impala_truncate_table.xml
new file mode 100644
index 0000000..9f0d00b
--- /dev/null
+++ b/docs/topics/impala_truncate_table.xml
@@ -0,0 +1,151 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.3.0 5.5.0" id="truncate_table">
+
+ <title>TRUNCATE TABLE Statement (CDH 5.5 or higher only)</title>
+ <titlealts><navtitle>TRUNCATE TABLE</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="Cloudera">TRUNCATE TABLE statement</indexterm>
+ Removes the data from an Impala table while leaving the table itself.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<!-- <codeblock>TRUNCATE TABLE [IF EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname></codeblock> -->
+<codeblock>TRUNCATE TABLE [<varname>db_name</varname>.]<varname>table_name</varname></codeblock>
+
+ <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ Often used to empty tables that are used during ETL cycles, after the data has been copied to another
+ table for the next stage of processing. This statement is a low-overhead alternative to dropping and
+ recreating the table, or using <codeph>INSERT OVERWRITE</codeph> to replace the data during the
+ next ETL cycle.
+ </p>
+
+ <p>
+ This statement removes all the data and associated data files in the table. It can remove data files from internal tables,
+ external tables, partitioned tables, and tables mapped to HBase or the Amazon Simple Storage Service (S3).
+ The data removal applies to the entire table, including all partitions of a partitioned table.
+ </p>
+
+ <p>
+ Any statistics produced by the <codeph>COMPUTE STATS</codeph> statement are reset when the data is removed.
+ </p>
+
+ <p>
+ Make sure that you are in the correct database before truncating a table, either by issuing a
+ <codeph>USE</codeph> statement first or by using a fully qualified name
+ <codeph><varname>db_name</varname>.<varname>table_name</varname></codeph>.
+ </p>
+
+<!-- IF EXISTS apparently not implemented for this first go-round. Filing a JIRA about that:
+ <p>
+ The optional <codeph>IF EXISTS</codeph> clause makes the statement succeed whether or not the table exists.
+ If the table does exist, it is truncated; if it does not exist, the statement has no effect. This capability is
+ useful in standardized setup scripts that might be run both before and after some of the tables exist.
+ </p>
+-->
+
+ <p>
+ Any HDFS data files removed by this statement go into the HDFS trashcan, from which you can recover them
+ within a defined time interval if this operation turns out to be a mistake.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/disk_space_blurb"/>
+
+ <p conref="../shared/impala_common.xml#common/s3_blurb"/>
+ <p rev="2.2.0">
+ Although Impala cannot write new data to a table stored in the Amazon
+ S3 filesystem, the <codeph>TRUNCATE TABLE</codeph> statement can remove data files from S3.
+ See <xref href="impala_s3.xml#s3"/> for details about working with S3 tables.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+ <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+ <p rev="CDH-19187">
+ The user ID that the <cmdname>impalad</cmdname> daemon runs under,
+ typically the <codeph>impala</codeph> user, must have write
+ permission for all the files and directories that make up the table.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following example shows a table containing some data and with table and column statistics.
+ After the <codeph>TRUNCATE TABLE</codeph> statement, the data is removed and the statistics
+ are reset.
+ </p>
+
+<codeblock>CREATE TABLE truncate_demo (x INT);
+INSERT INTO truncate_demo VALUES (1), (2), (4), (8);
+SELECT COUNT(*) FROM truncate_demo;
++----------+
+| count(*) |
++----------+
+| 4 |
++----------+
+COMPUTE STATS truncate_demo;
++-----------------------------------------+
+| summary |
++-----------------------------------------+
+| Updated 1 partition(s) and 1 column(s). |
++-----------------------------------------+
+SHOW TABLE STATS truncate_demo;
++-------+--------+------+--------------+-------------------+--------+-------------------+
+| #Rows | #Files | Size | Bytes Cached | Cache Replication | Format | Incremental stats |
++-------+--------+------+--------------+-------------------+--------+-------------------+
+| 4 | 1 | 8B | NOT CACHED | NOT CACHED | TEXT | false |
++-------+--------+------+--------------+-------------------+--------+-------------------+
+SHOW COLUMN STATS truncate_demo;
++--------+------+------------------+--------+----------+----------+
+| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size |
++--------+------+------------------+--------+----------+----------+
+| x | INT | 4 | -1 | 4 | 4 |
++--------+------+------------------+--------+----------+----------+
+
+-- After this statement, the data and the table/column stats will be gone.
+TRUNCATE TABLE truncate_demo;
+
+SELECT COUNT(*) FROM truncate_demo;
++----------+
+| count(*) |
++----------+
+| 0 |
++----------+
+SHOW TABLE STATS truncate_demo;
++-------+--------+------+--------------+-------------------+--------+-------------------+
+| #Rows | #Files | Size | Bytes Cached | Cache Replication | Format | Incremental stats |
++-------+--------+------+--------------+-------------------+--------+-------------------+
+| -1 | 0 | 0B | NOT CACHED | NOT CACHED | TEXT | false |
++-------+--------+------+--------------+-------------------+--------+-------------------+
+SHOW COLUMN STATS truncate_demo;
++--------+------+------------------+--------+----------+----------+
+| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size |
++--------+------+------------------+--------+----------+----------+
+| x | INT | -1 | -1 | 4 | 4 |
++--------+------+------------------+--------+----------+----------+
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+
+ <p>
+ <xref href="impala_tables.xml#tables"/>,
+ <xref href="impala_alter_table.xml#alter_table"/>, <xref href="impala_create_table.xml#create_table"/>,
+ <xref href="impala_partitioning.xml#partitioning"/>, <xref href="impala_tables.xml#internal_tables"/>,
+ <xref href="impala_tables.xml#external_tables"/>
+ </p>
+
+ </conbody>
+</concept>