You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@impala.apache.org by ta...@apache.org on 2017/01/30 17:00:28 UTC

[1/7] incubator-impala git commit: Updates to DML statements for Impala + Kudu

Repository: incubator-impala
Updated Branches:
  refs/heads/master f590bc0da -> dde559b31


Updates to DML statements for Impala + Kudu

Fill in syntax, usage notes, examples for
UPDATE, DELETE, UPSERT. Take out IGNORE from
INSERT.

Add 2nd syntax form and examples for DELETE.

Add join syntax to UPDATE.

Change-Id: I60512b7957fb53d86d3123a4f1d46fbb355f4665
Reviewed-on: http://gerrit.cloudera.org:8080/5646
Reviewed-by: Matthew Jacobs <mj...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/fc721fbd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/fc721fbd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/fc721fbd

Branch: refs/heads/master
Commit: fc721fbd0691e03cee3f2e77dadd71da263a6d72
Parents: f590bc0
Author: John Russell <jr...@cloudera.com>
Authored: Mon Jan 9 14:03:07 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Jan 27 21:12:56 2017 +0000

----------------------------------------------------------------------
 docs/shared/impala_common.xml |  32 ++++++++-
 docs/topics/impala_delete.xml | 136 +++++++++++++++++++++++++++++++++----
 docs/topics/impala_insert.xml |  39 ++++++++---
 docs/topics/impala_update.xml | 130 ++++++++++++++++++++++++++++++-----
 docs/topics/impala_upsert.xml |  42 +++++++++---
 5 files changed, 330 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/fc721fbd/docs/shared/impala_common.xml
----------------------------------------------------------------------
diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index 8990dbb..7b3e697 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -2720,6 +2720,10 @@ select max(height), avg(height) from census_data where age &gt; 20;
         <xref href="../topics/impala_sync_ddl.xml#sync_ddl">SYNC_DDL</xref> query option)
       </p>
 
+      <p id="dml_blurb_kudu" rev="kudu">
+        <b>Statement type:</b> DML
+      </p>
+
       <p rev="1.2" id="sync_ddl_blurb">
         If you connect to different Impala nodes within an <cmdname>impala-shell</cmdname> session for
         load-balancing purposes, you can enable the <codeph>SYNC_DDL</codeph> query option to make each DDL
@@ -3691,14 +3695,38 @@ sudo pip-python install ssl</codeblock>
         around sharing content between the Impala documentation and the Kudu documentation.
       </p>
 
-      <p id="kudu_blurb">
+      <p id="kudu_blurb" rev="kudu 2.8.0">
         <b>Kudu considerations:</b>
       </p>
 
-      <p id="kudu_no_load_data">
+      <p id="kudu_no_load_data" rev="kudu">
         The <codeph>LOAD DATA</codeph> statement cannot be used with Kudu tables.
       </p>
 
+      <p id="kudu_no_truncate_table" rev="kudu">
+        Currently, the <codeph>TRUNCATE TABLE</codeph> statement cannot be used with Kudu tables.
+      </p>
+
+      <p id="kudu_no_insert_overwrite" rev="kudu">
+        Currently, the <codeph>INSERT OVERWRITE</codeph> syntax cannot be used with Kudu tables.
+      </p>
+
+      <p id="kudu_unsupported_data_type" rev="kudu">
+        Currently, the data types <codeph>DECIMAL</codeph>, <codeph>TIMESTAMP</codeph>, <codeph>CHAR</codeph>, <codeph>VARCHAR</codeph>,
+        <codeph>ARRAY</codeph>, <codeph>MAP</codeph>, and <codeph>STRUCT</codeph> cannot be used with Kudu tables.
+      </p>
+
+      <p id="kudu_non_pk_data_type" rev="kudu">
+        Currently, the data types <codeph>BOOLEAN</codeph>, <codeph>FLOAT</codeph>,
+        and <codeph>DOUBLE</codeph> cannot be used for primary key columns in Kudu tables.
+      </p>
+
+      <p id="pk_implies_not_null" rev="kudu">
+        Because all of the primary key columns must have non-null values, specifying a column
+        in the <codeph>PRIMARY KEY</codeph> clause implicitly adds the <codeph>NOT
+        NULL</codeph> attribute to that column.
+      </p>
+
     </section>
 
   </conbody>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/fc721fbd/docs/topics/impala_delete.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_delete.xml b/docs/topics/impala_delete.xml
index af20d19..c8591b6 100644
--- a/docs/topics/impala_delete.xml
+++ b/docs/topics/impala_delete.xml
@@ -18,7 +18,7 @@ specific language governing permissions and limitations
 under the License.
 -->
 <!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
-<concept id="delete">
+<concept id="delete" rev="kudu">
 
   <title>DELETE Statement (<keyword keyref="impala28"/> or higher only)</title>
   <titlealts audience="PDF"><navtitle>DELETE</navtitle></titlealts>
@@ -39,43 +39,149 @@ under the License.
 
     <p>
       <indexterm audience="hidden">DELETE statement</indexterm>
-      Deletes one or more rows from a Kudu table.
-      Although deleting a single row or a range of rows would be inefficient for tables using HDFS
-      data files, Kudu is able to perform this operation efficiently. Therefore, this statement
-      only works for Impala tables that use the Kudu storage engine.
+      Deletes an arbitrary number of rows from a Kudu table.
+      This statement only works for Impala tables that use the Kudu storage engine.
     </p>
 
     <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
 
 <codeblock>
+DELETE [FROM] [<varname>database_name</varname>.]<varname>table_name</varname> [ WHERE <varname>where_conditions</varname> ]
+
+DELETE <varname>table_ref</varname> FROM [<varname>joined_table_refs</varname>] [ WHERE <varname>where_conditions</varname> ]
 </codeblock>
 
-    <p rev="kudu">
-      Normally, a <codeph>DELETE</codeph> operation for a Kudu table fails if
-      some partition key columns are not found, due to their being deleted or changed
-      by a concurrent <codeph>UPDATE</codeph> or <codeph>DELETE</codeph> operation.
-      Specify <codeph>DELETE IGNORE <varname>rest_of_statement</varname></codeph> to
-      make the <codeph>DELETE</codeph> continue in this case. The rows with the nonexistent
-      duplicate partition key column values are not removed.
+    <p>
+      The first form evaluates rows from one table against an optional
+      <codeph>WHERE</codeph> clause, and deletes all the rows that
+      match the <codeph>WHERE</codeph> conditions, or all rows if
+      <codeph>WHERE</codeph> is omitted.
     </p>
 
-    <p conref="../shared/impala_common.xml#common/dml_blurb"/>
+    <p>
+      The second form evaluates one or more join clauses, and deletes
+      all matching rows from one of the tables. The join clauses can
+      include non-Kudu tables, but the table from which the rows
+      are deleted must be a Kudu table. The <codeph>FROM</codeph>
+      keyword is required in this case, to separate the name of
+      the table whose rows are being deleted from the table names
+      of the join clauses.
+    </p>
 
     <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
 
-    <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+    <p>
+      The conditions in the <codeph>WHERE</codeph> clause are the same ones allowed
+      for the <codeph>SELECT</codeph> statement. See <xref href="impala_select.xml#select"/>
+      for details.
+    </p>
+
+    <p>
+      The conditions in the <codeph>WHERE</codeph> clause can refer to
+      any combination of primary key columns or other columns. Referring to
+      primary key columns in the <codeph>WHERE</codeph> clause is more efficient
+      than referring to non-primary key columns.
+    </p>
+
+    <p>
+      If the <codeph>WHERE</codeph> clause is omitted, all rows are removed from the table.
+    </p>
+
+    <p>
+      Because Kudu currently does not enforce strong consistency during concurrent DML operations,
+      be aware that the results after this statement finishes might be different than you
+      intuitively expect:
+    </p>
+    <ul>
+      <li>
+        <p>
+          If some rows cannot be deleted because
+          some of their primary key columns are not found, due to their being deleted
+          by a concurrent <codeph>DELETE</codeph> operation,
+          the statement succeeds but returns a warning.
+        </p>
+      </li>
+      <li>
+        <p>
+          A <codeph>DELETE</codeph> statement might also overlap with
+          <codeph>INSERT</codeph>, <codeph>UPDATE</codeph>,
+          or <codeph>UPSERT</codeph> statements running concurrently on the same table.
+          After the statement finishes, there might be more or fewer rows than expected in the table
+          because it is undefined whether the <codeph>DELETE</codeph> applies to rows that are
+          inserted or updated while the <codeph>DELETE</codeph> is in progress.
+        </p>
+      </li>
+    </ul>
+
+    <p>
+      The number of affected rows is reported in an <cmdname>impala-shell</cmdname> message
+      and in the query profile.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/dml_blurb_kudu"/>
 
     <note conref="../shared/impala_common.xml#common/compute_stats_next"/>
 
     <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+    <p>
+      The following examples show how to delete rows from a specified
+      table, either all rows or rows that match a <codeph>WHERE</codeph>
+      clause:
+    </p>
+
 <codeblock>
+-- Deletes all rows. The FROM keyword is optional.
+DELETE FROM kudu_table;
+DELETE kudu_table;
+
+-- Deletes 0, 1, or more rows.
+-- (If c1 is a single-column primary key, the statement could only
+-- delete 0 or 1 rows.)
+DELETE FROM kudu_table WHERE c1 = 100;
+
+-- Deletes all rows that match all the WHERE conditions.
+DELETE FROM kudu_table WHERE
+  (c1 > c2 OR c3 IN ('hello','world')) AND c4 IS NOT NULL;
+DELETE FROM t1 WHERE
+  (c1 IN (1,2,3) AND c2 > c3) OR c4 IS NOT NULL;
+DELETE FROM time_series WHERE
+  year = 2016 AND month IN (11,12) AND day > 15;
+
+-- WHERE condition with a subquery.
+DELETE FROM t1 WHERE
+  c5 IN (SELECT DISTINCT other_col FROM other_table);
+
+-- Does not delete any rows, because the WHERE condition is always false.
+DELETE FROM kudu_table WHERE 1 = 0;
+</codeblock>
+
+    <p>
+      The following examples show how to delete rows that are part
+      of the result set from a join:
+    </p>
+
+<codeblock>
+-- Remove _all_ rows from t1 that have a matching X value in t2.
+DELETE t1 FROM t1 JOIN t2 ON t1.x = t2.x;
+
+-- Remove _some_ rows from t1 that have a matching X value in t2.
+DELETE t1 FROM t1 JOIN t2 ON t1.x = t2.x
+  WHERE t1.y = FALSE and t2.z > 100;
+
+-- Delete from a Kudu table based on a join with a non-Kudu table.
+DELETE t1 FROM kudu_table t1 JOIN non_kudu_table t2 ON t1.x = t2.x;
 
+-- The tables can be joined in any order as long as the Kudu table
+-- is specified as the deletion target.
+DELETE t2 FROM non_kudu_table t1 JOIN kudu_table t2 ON t1.x = t2.x;
 </codeblock>
 
     <p conref="../shared/impala_common.xml#common/related_info"/>
 
     <p>
-      <xref href="impala_kudu.xml#impala_kudu"/>
+      <xref href="impala_kudu.xml#impala_kudu"/>, <xref href="impala_insert.xml#insert"/>,
+      <xref href="impala_update.xml#update"/>, <xref href="impala_upsert.xml#upsert"/>
     </p>
 
   </conbody>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/fc721fbd/docs/topics/impala_insert.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_insert.xml b/docs/topics/impala_insert.xml
index ed9f78f..7e7b76b 100644
--- a/docs/topics/impala_insert.xml
+++ b/docs/topics/impala_insert.xml
@@ -33,7 +33,7 @@ under the License.
       <data name="Category" value="Developers"/>
       <data name="Category" value="Tables"/>
       <data name="Category" value="S3"/>
-      <!-- <data name="Category" value="Kudu"/> -->
+      <data name="Category" value="Kudu"/>
       <!-- This is such an important statement, think if there are more applicable categories. -->
     </metadata>
   </prolog>
@@ -90,16 +90,39 @@ hint_clause ::= [SHUFFLE] | [NOSHUFFLE]    (Note: the square brackets are part o
       See <xref href="impala_complex_types.xml#complex_types"/> for details about working with complex types.
     </p>
 
+    <p conref="../shared/impala_common.xml#common/kudu_blurb"/>
+
+    <p conref="../shared/impala_common.xml#common/kudu_no_insert_overwrite"/>
+
     <p rev="kudu">
-      <b>Ignoring duplicate partition keys for Kudu tables (IGNORE clause)</b>
+      Kudu tables require a unique primary key for each row. If an <codeph>INSERT</codeph>
+      statement attempts to insert a row with the same values for the primary key columns
+      as an existing row, that row is discarded and the insert operation continues.
+      When rows are discarded due to duplicate primary keys, the statement finishes
+      with a warning, not an error. (This is a change from early releases of Kudu
+      where the default was to return an error in such cases, and the syntax
+      <codeph>INSERT IGNORE</codeph> was required to make the statement succeed.
+      The <codeph>IGNORE</codeph> clause is no longer part of the <codeph>INSERT</codeph>
+      syntax.)
     </p>
 
-    <p rev="kudu">
-      Normally, an <codeph>INSERT</codeph> operation into a Kudu table fails if
-      it would result in duplicate partition key columns for any rows.
-      Specify <codeph>INSERT IGNORE <varname>rest_of_statement</varname></codeph> to
-      make the <codeph>INSERT</codeph> continue in this case. The rows that would
-      have duplicate partition key columns are not inserted.
+    <p>
+      For situations where you prefer to replace rows with duplicate primary key values,
+      rather than discarding the new data, you can use the <codeph>UPSERT</codeph>
+      statement instead of <codeph>INSERT</codeph>. <codeph>UPSERT</codeph> inserts
+      rows that are entirely new, and for rows that match an existing primary key in the
+      table, the non-primary-key columns are updated to reflect the values in the
+      <q>upserted</q> data.
+    </p>
+
+    <p>
+      If you really want to store new rows, not replace existing ones, but cannot do so
+      because of the primary key uniqueness constraint, consider recreating the table
+      with additional columns included in the primary key.
+    </p>
+
+    <p>
+      See <xref href="impala_kudu.xml#impala_kudu"/> for more details about using Impala with Kudu.
     </p>
 
     <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/fc721fbd/docs/topics/impala_update.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_update.xml b/docs/topics/impala_update.xml
index 3f78a17..0d3250b 100644
--- a/docs/topics/impala_update.xml
+++ b/docs/topics/impala_update.xml
@@ -18,7 +18,7 @@ specific language governing permissions and limitations
 under the License.
 -->
 <!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
-<concept id="update">
+<concept id="update" rev="kudu">
 
   <title>UPDATE Statement (<keyword keyref="impala28"/> or higher only)</title>
   <titlealts audience="PDF"><navtitle>UPDATE</navtitle></titlealts>
@@ -39,43 +39,141 @@ under the License.
 
     <p>
       <indexterm audience="hidden">UPDATE statement</indexterm>
-      Updates one or more rows from a Kudu table.
-      Although updating a single row or a range of rows would be inefficient for tables using HDFS
-      data files, Kudu is able to perform this operation efficiently. Therefore, this statement
-      only works for Impala tables that use the Kudu storage engine.
+      Updates an arbitrary number of rows in a Kudu table.
+      This statement only works for Impala tables that use the Kudu storage engine.
     </p>
 
     <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
 
 <codeblock>
+UPDATE [<varname>database_name</varname>.]<varname>table_name</varname> SET <varname>col</varname> = <varname>val</varname> [, <varname>col</varname> = <varname>val</varname> ... ]
+  [ FROM <varname>joined_table_refs</varname> ]
+  [ WHERE <varname>where_conditions</varname> ]
 </codeblock>
 
-    <p rev="kudu">
-      Normally, an <codeph>UPDATE</codeph> operation for a Kudu table fails if
-      some partition key columns are not found, due to their being deleted or changed
-      by a concurrent <codeph>UPDATE</codeph> or <codeph>DELETE</codeph> operation.
-      Specify <codeph>UPDATE IGNORE <varname>rest_of_statement</varname></codeph> to
-      make the <codeph>UPDATE</codeph> continue in this case. The rows with the nonexistent
-      duplicate partition key column values are not changed.
+    <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+    <p>
+      None of the columns that make up the primary key can be updated by the
+      <codeph>SET</codeph> clause.
     </p>
 
-    <p conref="../shared/impala_common.xml#common/dml_blurb"/>
+    <p>
+      The conditions in the <codeph>WHERE</codeph> clause are the same ones allowed
+      for the <codeph>SELECT</codeph> statement. See <xref href="impala_select.xml#select"/>
+      for details.
+    </p>
 
-    <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+    <p>
+      If the <codeph>WHERE</codeph> clause is omitted, all rows in the table are updated.
+    </p>
+
+    <p>
+      The conditions in the <codeph>WHERE</codeph> clause can refer to
+      any combination of primary key columns or other columns. Referring to
+      primary key columns in the <codeph>WHERE</codeph> clause is more efficient
+      than referring to non-primary key columns.
+    </p>
+
+    <p>
+      Because Kudu currently does not enforce strong consistency during concurrent DML operations,
+      be aware that the results after this statement finishes might be different than you
+      intuitively expect:
+    </p>
+    <ul>
+      <li>
+        <p>
+          If some rows cannot be updated because
+          some of their primary key columns are not found, due to their being deleted
+          by a concurrent <codeph>DELETE</codeph> operation,
+          the statement succeeds but returns a warning.
+        </p>
+      </li>
+      <li>
+        <p>
+          An <codeph>UPDATE</codeph> statement might also overlap with
+          <codeph>INSERT</codeph>, <codeph>UPDATE</codeph>,
+          or <codeph>UPSERT</codeph> statements running concurrently on the same table.
+          After the statement finishes, there might be more or fewer matching rows than expected
+          in the table because it is undefined whether the <codeph>UPDATE</codeph> applies to rows
+          that are inserted or updated while the <codeph>UPDATE</codeph> is in progress.
+        </p>
+      </li>
+    </ul>
+
+    <p>
+      The number of affected rows is reported in an <cmdname>impala-shell</cmdname> message
+      and in the query profile.
+    </p>
+
+    <p>
+      The optional <codeph>FROM</codeph> clause lets you restrict the
+      updates to only the rows in the specified table that are part
+      of the result set for a join query. The join clauses can include
+      non-Kudu tables, but the table whose rows are updated
+      must be a Kudu table.
+    </p>
 
-    <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+    <p conref="../shared/impala_common.xml#common/dml_blurb_kudu"/>
 
     <note conref="../shared/impala_common.xml#common/compute_stats_next"/>
 
     <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+    <p>
+      The following examples show how to perform a simple update
+      on a table, with or without a <codeph>WHERE</codeph> clause:
+    </p>
+
 <codeblock>
+-- Set all rows to the same value for column c3.
+-- In this case, c1 and c2 are primary key columns
+-- and so cannot be updated.
+UPDATE kudu_table SET c3 = 'not applicable';
 
+-- Update only the rows that match the condition.
+UPDATE kudu_table SET c3 = NULL WHERE c1 > 100 AND c3 IS NULL;
+
+-- Does not update any rows, because the WHERE condition is always false.
+UPDATE kudu_table SET c3 = 'impossible' WHERE 1 = 0;
+
+-- Change the values of multiple columns in a single UPDATE statement.
+UPDATE kudu_table SET c3 = upper(c3), c4 = FALSE, c5 = 0 WHERE c6 = TRUE;
+</codeblock>
+
+    <p>
+      The following examples show how to perform an update using the
+      <codeph>FROM</codeph> keyword with a join clause:
+    </p>
+
+<codeblock>
+-- Uppercase a column value, only for rows that have
+-- an ID that matches the value from another table.
+UPDATE kudu_table SET c3 = upper(c3)
+  FROM kudu_table JOIN non_kudu_table
+  ON kudu_table.id = non_kudu_table.id;
+
+-- Same effect as previous statement.
+-- Assign table aliases in FROM clause, then refer to
+-- short names elsewhere in the statement.
+UPDATE t1 SET c3 = upper(c3)
+  FROM kudu_table t1 JOIN non_kudu_table t2
+  ON t1.id = t2.id;
+
+-- Same effect as previous statements, but more efficient.
+-- Use WHERE clause to skip updating values that are
+-- already uppercase.
+UPDATE t1 SET c3 = upper(c3)
+  FROM kudu_table t1 JOIN non_kudu_table t2
+  ON t1.id = t2.id
+  WHERE c3 != upper(c3);
 </codeblock>
 
     <p conref="../shared/impala_common.xml#common/related_info"/>
 
     <p>
-      <xref href="impala_kudu.xml#impala_kudu"/>
+      <xref href="impala_kudu.xml#impala_kudu"/>, <xref href="impala_insert.xml#insert"/>,
+      <xref href="impala_delete.xml#delete"/>, <xref href="impala_upsert.xml#upsert"/>
     </p>
 
   </conbody>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/fc721fbd/docs/topics/impala_upsert.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_upsert.xml b/docs/topics/impala_upsert.xml
index d4e880f..5830675 100644
--- a/docs/topics/impala_upsert.xml
+++ b/docs/topics/impala_upsert.xml
@@ -18,7 +18,7 @@ specific language governing permissions and limitations
 under the License.
 -->
 <!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
-<concept id="upsert">
+<concept id="upsert" rev="kudu IMPALA-3725">
 
   <title>UPSERT Statement (<keyword keyref="impala28"/> or higher only)</title>
   <titlealts audience="PDF"><navtitle>UPSERT</navtitle></titlealts>
@@ -62,28 +62,54 @@ under the License.
     </p>
 
     <p>
-      Although inserting or updating a single row or a small set of rows would be inefficient for tables using HDFS
-      data files, Kudu is able to perform this operation efficiently.
-      Therefore, this statement only works for Impala tables that use the
-      Kudu storage engine.
+      This statement only works for Impala tables that use the Kudu storage engine.
     </p>
 
     <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
 
 <codeblock>
+UPSERT INTO [TABLE] [<varname>db_name</varname>.]<varname>table_name</varname>
+  [(<varname>column_list</varname>)]
+{
+    [<varname>hint_clause</varname>] <varname>select_statement</varname>
+  | VALUES (<varname>value</varname> [, <varname>value</varname> ...]) [, (<varname>value</varname> [, <varname>value</varname> ...]) ...]
+}
+
+hint_clause ::= [SHUFFLE] | [NOSHUFFLE]
+  (Note: the square brackets are part of the syntax.)
 </codeblock>
 
-    <p conref="../shared/impala_common.xml#common/dml_blurb"/>
+    <p>
+      The <varname>select_statement</varname> clause can use the full syntax, such as
+      <codeph>WHERE</codeph> and join clauses, described in <xref href="impala_select.xml#select"/>.
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/dml_blurb_kudu"/>
 
     <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
 
-    <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
+    <p>
+      If you specify a column list, any omitted columns in the inserted or updated rows are
+      set to their default value (if the column has one) or <codeph>NULL</codeph> (if the
+      column does not have a default value). Therefore, if a column is not nullable and
+      has no default value, it must be included in the column list for any <codeph>UPSERT</codeph>
+      statement. Because all primary key columns meet these conditions, all the primary key
+      columns must be specified in every <codeph>UPSERT</codeph> statement.
+    </p>
+
+    <p>
+      Because Kudu tables can efficiently handle small incremental changes, the <codeph>VALUES</codeph>
+      clause is more practical to use with Kudu tables than with HDFS-based tables.
+    </p>
 
     <note conref="../shared/impala_common.xml#common/compute_stats_next"/>
 
     <p conref="../shared/impala_common.xml#common/example_blurb"/>
-<codeblock>
 
+<codeblock>
+UPSERT INTO kudu_table (pk, c1, c2, c3) VALUES (0, 'hello', 50, true), (1, 'world', -1, false);
+UPSERT INTO production_table SELECT * FROM staging_table;
+UPSERT INTO production_table SELECT * FROM staging_table WHERE c1 IS NOT NULL AND c2 > 0;
 </codeblock>
 
     <p conref="../shared/impala_common.xml#common/related_info"/>

[2/7] incubator-impala git commit: [DOCS] Add doc for MT_DOP query option.

Posted by ta...@apache.org.

[DOCS] Add doc for MT_DOP query option.

Add info about MT_DOP default to COMPUTE STATS.

Change-Id: Ife2786532b425af6d230074f1c0b5c7dcb2b8a92
Reviewed-on: http://gerrit.cloudera.org:8080/5652
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/6a95f420
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/6a95f420
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/6a95f420

Branch: refs/heads/master
Commit: 6a95f42022300540f485efda0252e1fe85f07823
Parents: fc721fb
Author: John Russell <jr...@cloudera.com>
Authored: Mon Jan 9 16:40:02 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Jan 27 23:49:37 2017 +0000

----------------------------------------------------------------------
 docs/impala.ditamap                  |   1 +
 docs/impala_keydefs.ditamap          |   2 +-
 docs/shared/impala_common.xml        |   4 +
 docs/topics/impala_compute_stats.xml |   9 ++
 docs/topics/impala_mt_dop.xml        | 208 ++++++++++++++++++++++++++++++
 5 files changed, 223 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6a95f420/docs/impala.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index 46b8c7f..172319e 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -194,6 +194,7 @@ under the License.
           <topicref href="topics/impala_max_scan_range_length.xml"/>
           <topicref rev="2.5.0" href="topics/impala_max_num_runtime_filters.xml"/>
           <topicref href="topics/impala_mem_limit.xml"/>
+          <topicref rev="2.8.0" href="topics/impala_mt_dop.xml"/>
           <topicref href="topics/impala_num_nodes.xml"/>
           <topicref href="topics/impala_num_scanner_threads.xml"/>
           <topicref rev="2.5.0" href="topics/impala_optimize_partition_key_scans.xml"/>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6a95f420/docs/impala_keydefs.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala_keydefs.ditamap b/docs/impala_keydefs.ditamap
index bee6672..da62dd1 100644
--- a/docs/impala_keydefs.ditamap
+++ b/docs/impala_keydefs.ditamap
@@ -793,7 +793,7 @@ https://issues.cloudera.org/secure/IssueNavigator.jspa?reset=true&amp;jqlQuery=p
   <keydef href="topics/impala_max_scan_range_length.xml" keys="max_scan_range_length"/>
   <keydef href="topics/impala_max_num_runtime_filters.xml" keys="max_num_runtime_filters"/>
   <keydef href="topics/impala_mem_limit.xml" keys="mem_limit"/>
-  <!-- <keydef href="topics/impala_mt_dop.xml" keys="mt_dop"/> -->
+  <keydef href="topics/impala_mt_dop.xml" keys="mt_dop"/>
   <keydef href="topics/impala_num_nodes.xml" keys="num_nodes"/>
   <keydef href="topics/impala_num_scanner_threads.xml" keys="num_scanner_threads"/>
   <keydef href="topics/impala_optimize_partition_key_scans.xml" keys="optimize_partition_key_scans"/>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6a95f420/docs/shared/impala_common.xml
----------------------------------------------------------------------
diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index 7b3e697..4309e84 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -1581,6 +1581,10 @@ explain select s from yy2 where year in (select year from yy where year between
         <b>Default:</b> <codeph>false</codeph>
       </p>
 
+      <p id="default_0">
+        <b>Default:</b> <codeph>0</codeph>
+      </p>
+
       <p id="default_false_0">
         <b>Default:</b> <codeph>false</codeph> (shown as 0 in output of <codeph>SET</codeph> statement)
       </p>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6a95f420/docs/topics/impala_compute_stats.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_compute_stats.xml b/docs/topics/impala_compute_stats.xml
index 5a15c72..91f45c2 100644
--- a/docs/topics/impala_compute_stats.xml
+++ b/docs/topics/impala_compute_stats.xml
@@ -110,6 +110,15 @@ COMPUTE INCREMENTAL STATS [<varname>db_name</varname>.]<varname>table_name</varn
           The statistics help Impala to achieve high concurrency, full utilization of available memory, and avoid
           contention with workloads from other Hadoop components.
         </li>
+        <li rev="IMPALA-4572">
+          In <keyword keyref="impala28_full"/> and higher, when you run the
+          <codeph>COMPUTE STATS</codeph> or <codeph>COMPUTE INCREMENTAL STATS</codeph>
+          statement against a Parquet table, Impala automatically applies the query
+          option setting <codeph>MT_DOP=4</codeph> to increase the amount of intra-node
+          parallelism during this CPU-intensive operation. See <xref keyref="mt_dop"/>
+          for details about what this query option does and how to use it with
+          CPU-intensive <codeph>SELECT</codeph> statements.
+        </li>
       </ul>
     </note>
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6a95f420/docs/topics/impala_mt_dop.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_mt_dop.xml b/docs/topics/impala_mt_dop.xml
new file mode 100644
index 0000000..04fb1c0
--- /dev/null
+++ b/docs/topics/impala_mt_dop.xml
@@ -0,0 +1,208 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="mt_dop">
+
+  <title>MT_DOP Query Option</title>
+  <titlealts audience="PDF"><navtitle>MT_DOP</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Querying"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      <indexterm audience="hidden">MT_DOP query option</indexterm>
+      Sets the degree of intra-node parallelism used for certain operations that
+      can benefit from multithreaded execution. You can specify values
+      higher than zero to find the ideal balance of response time,
+      memory usage, and CPU usage during statement processing.
+    </p>
+
+    <note>
+      <p>
+        The Impala execution engine is being revamped incrementally to add
+        more parallelism within a single host for certain statements and
+        kinds of operations. The setting <codeph>MT_DOP=0</codeph> uses the
+        <q>old</q> code path with limited intra-node parallelism.
+      </p>
+
+      <p>
+        Currently, the operations affected by the <codeph>MT_DOP</codeph>
+        query option are:
+      </p>
+      <ul>
+        <li>
+          <p>
+            <codeph>COMPUTE [INCREMENTAL] STATS</codeph>. Impala automatically sets
+            <codeph>MT_DOP=4</codeph> for <codeph>COMPUTE STATS</codeph> and
+            <codeph>COMPUTE INCREMENTAL STATS</codeph> statements on Parquet tables.
+          </p>
+        </li>
+        <li>
+          <p>
+            Queries with execution plans containing only scan and aggregation operators,
+            or local joins that do not need data exchanges (such as for nested types).
+            Other queries produce an error if <codeph>MT_DOP</codeph> is set to a non-zero
+            value. Therefore, this query option is typically only set for the duration of
+            specific long-running, CPU-intensive queries.
+          </p>
+        </li>
+      </ul>
+
+    </note>
+
+    <p conref="../shared/impala_common.xml#common/type_integer"/>
+    <p conref="../shared/impala_common.xml#common/default_0"/>
+    <p>
+      Because <codeph>COMPUTE STATS</codeph> and <codeph>COMPUTE INCREMENTAL STATS</codeph>
+      statements for Parquet tables benefit substantially from extra intra-node
+      parallelism, Impala automatically sets <codeph>MT_DOP=4</codeph> when computing stats
+      for Parquet tables.
+    </p>
+    <p>
+      <b>Range:</b> 0 to 64
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+    <note>
+      <p>
+        Any timing figures in the following examples are on a small, lightly loaded development cluster.
+        Your mileage may vary. Speedups depend on many factors, including the number of rows, columns, and
+        partitions within each table.
+      </p>
+    </note>
+
+    <p>
+      The following example shows how to run a <codeph>COMPUTE STATS</codeph>
+      statement against a Parquet table with or without an explicit <codeph>MT_DOP</codeph>
+      setting:
+    </p>
+
+<codeblock><![CDATA[
+-- Explicitly setting MT_DOP to 0 selects the old code path.
+set mt_dop = 0;
+MT_DOP set to 0
+
+-- The analysis for the billion rows is distributed among hosts,
+-- but uses only a single core on each host.
+compute stats billion_rows_parquet;
++-----------------------------------------+
+| summary                                 |
++-----------------------------------------+
+| Updated 1 partition(s) and 2 column(s). |
++-----------------------------------------+
+
+drop stats billion_rows_parquet;
+
+-- Using 4 logical processors per host is faster.
+set mt_dop = 4;
+MT_DOP set to 4
+
+compute stats billion_rows_parquet;
++-----------------------------------------+
+| summary                                 |
++-----------------------------------------+
+| Updated 1 partition(s) and 2 column(s). |
++-----------------------------------------+
+
+drop stats billion_rows_parquet;
+
+-- Unsetting the option reverts it to its default,
+-- which for COMPUTE STATS and a Parquet table is 4,
+-- so again it uses the fast path.
+unset MT_DOP;
+Unsetting option MT_DOP
+
+compute stats billion_rows_parquet;
++-----------------------------------------+
+| summary                                 |
++-----------------------------------------+
+| Updated 1 partition(s) and 2 column(s). |
++-----------------------------------------+
+]]>
+</codeblock>
+
+    <p>
+      The following example shows the effects of setting <codeph>MT_DOP</codeph>
+      for a query involving only scan and aggregation operations for a Parquet table:
+    </p>
+
+<codeblock><![CDATA[
+set mt_dop = 0;
+MT_DOP set to 0
+
+-- COUNT(DISTINCT) for a unique column is CPU-intensive.
+select count(distinct id) from billion_rows_parquet;
++--------------------+
+| count(distinct id) |
++--------------------+
+| 1000000000         |
++--------------------+
+Fetched 1 row(s) in 67.20s
+
+set mt_dop = 16;
+MT_DOP set to 16
+
+-- Introducing more intra-node parallelism for the aggregation
+-- speeds things up, and potentially reduces memory overhead by
+-- reducing the number of scanner threads.
+select count(distinct id) from billion_rows_parquet;
++--------------------+
+| count(distinct id) |
++--------------------+
+| 1000000000         |
++--------------------+
+Fetched 1 row(s) in 17.19s
+]]>
+</codeblock>
+
+    <p>
+      The following example shows how queries that are not compatible with non-zero
+      <codeph>MT_DOP</codeph> settings produce an error when <codeph>MT_DOP</codeph>
+      is set:
+    </p>
+
+<codeblock><![CDATA[
+set mt_dop=1;
+MT_DOP set to 1
+
+select * from a1 inner join a2
+  on a1.id = a2.id limit 4;
+ERROR: NotImplementedException: MT_DOP not supported for plans with
+  base table joins or table sinks.
+]]>
+</codeblock>
+
+    <p conref="../shared/impala_common.xml#common/related_info"/>
+    <p>
+      <xref keyref="compute_stats"/>,
+      <xref keyref="aggregate_functions"/>
+    </p>
+
+  </conbody>
+</concept>

[7/7] incubator-impala git commit: Update copyright year to 2017

Posted by ta...@apache.org.

Update copyright year to 2017

Change-Id: I6a815c67176fc2a08ed693b581fac4c6919f297c
Reviewed-on: http://gerrit.cloudera.org:8080/5754
Reviewed-by: Michael Ho <kw...@cloudera.com>
Tested-by: Jim Apple <jb...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/dde559b3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/dde559b3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/dde559b3

Branch: refs/heads/master
Commit: dde559b3190a0cbe381352abfe258cba326b4213
Parents: 7b8ffd3
Author: Jim Apple <jb...@apache.org>
Authored: Fri Jan 20 20:37:04 2017 -0800
Committer: Jim Apple <jb...@apache.org>
Committed: Sun Jan 29 00:01:03 2017 +0000

----------------------------------------------------------------------
 NOTICE.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/dde559b3/NOTICE.txt
----------------------------------------------------------------------
diff --git a/NOTICE.txt b/NOTICE.txt
index 54c89ce..6feed69 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -1,5 +1,5 @@
 Apache Impala (incubating)
-Copyright 2016 The Apache Software Foundation
+Copyright 2017 The Apache Software Foundation
 
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).

[4/7] incubator-impala git commit: IMPALA-4778 IMPALA-1972: Add known issue

Posted by ta...@apache.org.

IMPALA-4778 IMPALA-1972: Add known issue

Change-Id: I46ee3489cb161d9f8dd2852f2c293f93edf0e6aa
Reviewed-on: http://gerrit.cloudera.org:8080/5747
Tested-by: Impala Public Jenkins
Reviewed-by: John Russell <jr...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/59dfa1f8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/59dfa1f8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/59dfa1f8

Branch: refs/heads/master
Commit: 59dfa1f8f6fa81cb3a085d6ceab0ad126a3a0c2e
Parents: 98ff43c
Author: John Russell <jr...@cloudera.com>
Authored: Thu Jan 19 10:49:47 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Jan 27 23:52:10 2017 +0000

----------------------------------------------------------------------
 docs/topics/impala_known_issues.xml | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/59dfa1f8/docs/topics/impala_known_issues.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_known_issues.xml b/docs/topics/impala_known_issues.xml
index a910ef0..b76545c 100644
--- a/docs/topics/impala_known_issues.xml
+++ b/docs/topics/impala_known_issues.xml
@@ -141,6 +141,31 @@ https://issues.cloudera.org/browse/IMPALA-2144 - Don't have
 
     </conbody>
 
+    <concept id="IMPALA-1972" rev="IMPALA-1972">
+
+      <title>Queries that take a long time to plan can cause webserver to block other queries</title>
+
+      <conbody>
+
+        <p>
+          Trying to get the details of a query through the debug web page
+          while the query is planning will block new queries that had not
+          started when the web page was requested. The web UI becomes
+          unresponsive until the planning phase is finished.
+        </p>
+
+        <p>
+          <b>Bug:</b> <xref keyref="IMPALA-1972">IMPALA-1972</xref>
+        </p>
+
+        <p>
+          <b>Severity:</b> High
+        </p>
+
+      </conbody>
+
+    </concept>
+
     <concept id="IMPALA-3069" rev="IMPALA-3069">
 
       <title>Setting BATCH_SIZE query option too large can cause a crash</title>

[3/7] incubator-impala git commit: IMPALA-4390: Separate ADD and DROP PARTITION syntax

Posted by ta...@apache.org.

IMPALA-4390: Separate ADD and DROP PARTITION syntax

Keep the location and cache options with ADD PARTITION,
and PURGE with DROP PARTITION.

Change-Id: I07458d9851bb2ab48311001f696353f834c4fe80
Reviewed-on: http://gerrit.cloudera.org:8080/5729
Reviewed-by: Dimitris Tsirogiannis <dt...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/98ff43c6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/98ff43c6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/98ff43c6

Branch: refs/heads/master
Commit: 98ff43c68dd0432ba108d195229e5a5f5c345d2d
Parents: 6a95f42
Author: John Russell <jr...@cloudera.com>
Authored: Wed Jan 18 10:12:49 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Jan 27 23:52:05 2017 +0000

----------------------------------------------------------------------
 docs/topics/impala_alter_table.xml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/98ff43c6/docs/topics/impala_alter_table.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_alter_table.xml b/docs/topics/impala_alter_table.xml
index 950ebc6..c4df150 100644
--- a/docs/topics/impala_alter_table.xml
+++ b/docs/topics/impala_alter_table.xml
@@ -60,7 +60,13 @@ ALTER TABLE <varname>name</varname> DROP [COLUMN] <varname>column_name</varname>
 ALTER TABLE <varname>name</varname> CHANGE <varname>column_name</varname> <varname>new_name</varname> <varname>new_type</varname>
 ALTER TABLE <varname>name</varname> REPLACE COLUMNS (<varname>col_spec</varname>[, <varname>col_spec</varname> ...])
 
-ALTER TABLE <varname>name</varname> { ADD [IF NOT EXISTS] | DROP [IF EXISTS] } PARTITION (<varname>partition_spec</varname>) <ph rev="2.3.0">[PURGE]</ph>
+ALTER TABLE <varname>name</varname> ADD [IF NOT EXISTS] PARTITION (<varname>partition_spec</varname>)
+  <ph rev="IMPALA-4390">[<varname>location_spec</varname>]</ph>
+  <ph rev="IMPALA-4390">[<varname>cache_spec</varname>]</ph>
+
+ALTER TABLE <varname>name</varname> DROP [IF EXISTS] PARTITION (<varname>partition_spec</varname>)
+  <ph rev="2.3.0">[PURGE]</ph>
+
 <ph rev="2.3.0 IMPALA-1568 CDH-36799">ALTER TABLE <varname>name</varname> RECOVER PARTITIONS</ph>
 
 ALTER TABLE <varname>name</varname> [PARTITION (<varname>partition_spec</varname>)]

[6/7] incubator-impala git commit: IMPALA-4789: Fix slow metadata loading due to inconsistent paths.

Posted by ta...@apache.org.

IMPALA-4789: Fix slow metadata loading due to inconsistent paths.

The fix for IMPALA-4172/IMPALA-3653 introduced a performance
regression for loading tables that have many partitions with:
1. Inconsistent HDFS path qualification or
2. A custom location (not under the table root dir)

For the first issue consider a table whose root path is at
'hdfs://localhost:8020/warehouse/tbl/'.
A partition with an unqualified location '/warehouse/tbl/p=1'
will not be recognized as being a descendant of the table root
dir by FileSystemUtil.isDescendantPath() because of how
Path.equals() behaves, even if 'hdfs://localhost:8020' is the
default filesystem. Such partitions are incorrectly recognized
as having a custom location and are loaded separately.

There were two performance issues:
1. The code for loading the files/blocks of partitions with
   truly custom locations was inefficient with an O(N^2)
   loop for determining the target partitions.
2. Each partition that is incorrectly identified as having
   a custom path (e.g. due to inconsistent qualification),
   is going to have its files/blocks loaded twice: once
   when the table root path is processed, and once when the
   'custom' partition is processed.

This patch fixes the detection of partitions with custom
locations, and improves the speed of loading partitions
with custom locations.

Change-Id: I8c881b7cb155032b82fba0e29350ca31de388d55
Reviewed-on: http://gerrit.cloudera.org:8080/5743
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/7b8ffd35
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/7b8ffd35
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/7b8ffd35

Branch: refs/heads/master
Commit: 7b8ffd35534c11ae3caa048229effc97613cd34f
Parents: a0ec519
Author: Alex Behm <al...@cloudera.com>
Authored: Thu Jan 19 00:22:47 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Jan 28 09:22:09 2017 +0000

----------------------------------------------------------------------
 .../org/apache/impala/catalog/HdfsTable.java    | 58 ++++++++++++++------
 .../apache/impala/common/FileSystemUtil.java    | 19 ++++++-
 2 files changed, 59 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7b8ffd35/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
index a6d0f47..795dae2 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
@@ -246,6 +246,8 @@ public class HdfsTable extends Table {
    *   file under it recursively.
    * - For every valid data file, map it to a partition from 'partsByPath' (if one exists)
    *   and enumerate all its blocks and their corresponding hosts and disk IDs.
+   * Requires that 'dirPath' and all paths in 'partsByPath' have consistent qualification
+   * (either fully qualified or unqualified), for isDescendantPath().
    * TODO: Split this method into more logical methods for cleaner code.
    */
   private void loadBlockMetadata(Path dirPath,
@@ -257,15 +259,29 @@ public class HdfsTable extends Table {
       if (LOG.isTraceEnabled()) {
         LOG.trace("Loading block md for " + name_ + " directory " + dirPath.toString());
       }
-      // Clear the state of partitions under dirPath since they are now updated based
-      // on the current snapshot of files in the directory.
-      for (Map.Entry<Path, List<HdfsPartition>> entry: partsByPath.entrySet()) {
-        Path partDir = entry.getKey();
-        if (!FileSystemUtil.isDescendantPath(partDir, dirPath)) continue;
-        for (HdfsPartition partition: entry.getValue()) {
+
+      // Clear the state of partitions under dirPath since they are going to be updated
+      // based on the current snapshot of files in the directory.
+      List<HdfsPartition> dirPathPartitions = partsByPath.get(dirPath);
+      if (dirPathPartitions != null) {
+        // The dirPath is a partition directory. This means the path is the root of an
+        // unpartitioned table, or the path of at least one partition.
+        for (HdfsPartition partition: dirPathPartitions) {
           partition.setFileDescriptors(new ArrayList<FileDescriptor>());
         }
+      } else {
+        // The dirPath is not a partition directory. We expect it to be an ancestor of
+        // partition paths (e.g., the table root). Clear all partitions whose paths are
+        // a descendant of dirPath.
+        for (Map.Entry<Path, List<HdfsPartition>> entry: partsByPath.entrySet()) {
+          Path partDir = entry.getKey();
+          if (!FileSystemUtil.isDescendantPath(partDir, dirPath)) continue;
+          for (HdfsPartition partition: entry.getValue()) {
+            partition.setFileDescriptors(new ArrayList<FileDescriptor>());
+          }
+        }
       }
+
       // For file systems that do not support BlockLocation API, we manually synthesize
       // block location metadata based on file formats.
       if (!FileSystemUtil.supportsStorageIds(fs)) {
@@ -671,7 +687,8 @@ public class HdfsTable extends Table {
     // using createPartition() calls. A single partition path can correspond to multiple
     // partitions.
     HashMap<Path, List<HdfsPartition>> partsByPath = Maps.newHashMap();
-    Path tblLocation = getHdfsBaseDirPath();
+    // Qualify to ensure isDescendantPath() works correctly.
+    Path tblLocation = FileSystemUtil.createFullyQualifiedPath(getHdfsBaseDirPath());
     // List of directories that we scan for block locations. We optimize the block metadata
     // loading to reduce the number of RPCs to the NN by separately loading partitions
     // with default directory paths (under the base table directory) and non-default
@@ -681,7 +698,7 @@ public class HdfsTable extends Table {
     // TODO: We can still do some advanced optimization by grouping all the partition
     // directories under the same ancestor path up the tree.
     List<Path> dirsToLoad = Lists.newArrayList(tblLocation);
-    FileSystem fs = tblLocation.getFileSystem(CONF);
+
     if (msTbl.getPartitionKeysSize() == 0) {
       Preconditions.checkArgument(msPartitions == null || msPartitions.isEmpty());
       // This table has no partition key, which means it has no declared partitions.
@@ -692,6 +709,7 @@ public class HdfsTable extends Table {
       partsByPath.put(tblLocation, Lists.newArrayList(part));
       if (isMarkedCached_) part.markCached();
       addPartition(part);
+      FileSystem fs = tblLocation.getFileSystem(CONF);
       if (fs.exists(tblLocation)) {
         accessLevel_ = getAvailableAccessLevel(fs, tblLocation);
       }
@@ -714,13 +732,17 @@ public class HdfsTable extends Table {
           // WRITE_ONLY the table's access level should be NONE.
           accessLevel_ = TAccessLevel.READ_ONLY;
         }
-        Path partDir = new Path(msPartition.getSd().getLocation());
+
+        // Qualify to ensure isDescendantPath() works correctly.
+        Path partDir = FileSystemUtil.createFullyQualifiedPath(
+            new Path(msPartition.getSd().getLocation()));
         List<HdfsPartition> parts = partsByPath.get(partDir);
         if (parts == null) {
           partsByPath.put(partDir, Lists.newArrayList(partition));
         } else {
           parts.add(partition);
         }
+
         if (!dirsToLoad.contains(partDir) &&
             !FileSystemUtil.isDescendantPath(partDir, tblLocation)) {
           // This partition has a custom filesystem location. Load its file/block
@@ -734,10 +756,10 @@ public class HdfsTable extends Table {
   }
 
   private void loadMetadataAndDiskIds(HdfsPartition partition) throws CatalogException {
-      Path partDirPath = partition.getLocationPath();
-      HashMap<Path, List<HdfsPartition>> partsByPath = Maps.newHashMap();
-      partsByPath.put(partDirPath, Lists.newArrayList(partition));
-      loadMetadataAndDiskIds(Lists.newArrayList(partDirPath), partsByPath);
+    Path partDirPath = partition.getLocationPath();
+    HashMap<Path, List<HdfsPartition>> partsByPath = Maps.newHashMap();
+    partsByPath.put(partDirPath, Lists.newArrayList(partition));
+    loadMetadataAndDiskIds(Lists.newArrayList(partDirPath), partsByPath);
   }
 
   /**
@@ -747,11 +769,13 @@ public class HdfsTable extends Table {
    */
   private void loadMetadataAndDiskIds(List<Path> locations,
       HashMap<Path, List<HdfsPartition>> partsByPath) {
-    LOG.info(String.format("Loading file and block metadata for %s partitions: %s",
-        partsByPath.size(), getFullName()));
+    LOG.info(String.format(
+        "Loading file and block metadata for %s partitions from %s paths: %s",
+        partsByPath.size(), locations.size(), getFullName()));
     for (Path location: locations) { loadBlockMetadata(location, partsByPath); }
-    LOG.info(String.format("Loaded file and block metadata for %s partitions: %s",
-        partsByPath.size(), getFullName()));
+    LOG.info(String.format(
+        "Loaded file and block metadata for %s partitions from %s paths: %s",
+        partsByPath.size(), locations.size(), getFullName()));
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7b8ffd35/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java b/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java
index 4767837..f8c50b4 100644
--- a/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java
+++ b/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java
@@ -37,6 +37,7 @@ import org.apache.hadoop.hdfs.protocol.EncryptionZone;
 import org.apache.impala.catalog.HdfsCompression;
 import org.apache.log4j.Logger;
 
+import com.google.common.base.Objects;
 import com.google.common.base.Preconditions;
 
 /**
@@ -424,12 +425,28 @@ public class FileSystemUtil {
 
   /**
    * Returns true if Path 'p' is a descendant of Path 'parent', false otherwise.
+   * This function relies on Path.equals(), which requires paths to have the same
+   * scheme and authority to compare equal. So 'p' and 'parent' should either both
+   * be qualified or both be unqualified for this function to behave as expected.
    */
   public static boolean isDescendantPath(Path p, Path parent) {
     if (p == null || parent == null) return false;
     while (!p.isRoot() && p.depth() != parent.depth()) p = p.getParent();
     if (p.isRoot()) return false;
-    return p.equals(parent);
+    boolean result = p.equals(parent);
+    if (!result && LOG.isTraceEnabled()) {
+      // Add a message to the log if 'p' and 'parent' have inconsistent qualification.
+      URI pUri = p.toUri();
+      URI parentUri = parent.toUri();
+      boolean sameScheme = Objects.equal(pUri.getScheme(), parentUri.getScheme());
+      boolean sameAuthority =
+          Objects.equal(pUri.getAuthority(), parentUri.getAuthority());
+      if (!sameScheme || !sameAuthority) {
+        LOG.trace("Inconsistent schema or authority for paths: " +
+            p.toString() + " " + parent.toString());
+      }
+    }
+    return result;
   }
 
   /**

[5/7] incubator-impala git commit: Add "Known Issues" item for IMPALA-4828.

Posted by ta...@apache.org.

Add "Known Issues" item for IMPALA-4828.

Change-Id: If287257639589c8bd2f56a1b5de4ac1a0bb2f082
Reviewed-on: http://gerrit.cloudera.org:8080/5807
Tested-by: Impala Public Jenkins
Reviewed-by: Matthew Jacobs <mj...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/a0ec5193
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/a0ec5193
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/a0ec5193

Branch: refs/heads/master
Commit: a0ec519362ce679485026909a6ef7a8d7dfc355d
Parents: 59dfa1f
Author: John Russell <jr...@cloudera.com>
Authored: Thu Jan 26 17:12:28 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Jan 28 01:06:19 2017 +0000

----------------------------------------------------------------------
 docs/topics/impala_known_issues.xml | 39 ++++++++++++++++----------------
 1 file changed, 20 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a0ec5193/docs/topics/impala_known_issues.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_known_issues.xml b/docs/topics/impala_known_issues.xml
index b76545c..fee82b8 100644
--- a/docs/topics/impala_known_issues.xml
+++ b/docs/topics/impala_known_issues.xml
@@ -62,7 +62,7 @@ under the License.
       <conbody>
         <p>
         </p>
-        <p><b>Bug:</b> <xref href="https://issues.cloudera.org/browse/" scope="external" format="html"></xref></p>
+        <p><b>Bug:</b> <xref keyref="" scope="external" format="html"></xref></p>
         <p><b>Severity:</b> High</p>
         <p><b>Resolution:</b> </p>
         <p><b>Workaround:</b> </p>
@@ -141,6 +141,25 @@ https://issues.cloudera.org/browse/IMPALA-2144 - Don't have
 
     </conbody>
 
+    <concept id="IMPALA-4828">
+      <title>Altering Kudu table schema outside of Impala may result in crash on read</title>
+      <conbody>
+        <p>
+          Creating a table in Impala, changing the column schema outside of Impala,
+          and then reading again in Impala may result in a crash. Neither Impala nor
+          the Kudu client validates the schema immediately before reading, so Impala may attempt to
+          dereference pointers that aren't there. This happens if a string column is dropped
+          and then a new, non-string column is added with the old string column's name.
+        </p>
+        <p><b>Bug:</b> <xref keyref="IMPALA-4828" scope="external" format="html">IMPALA-4828</xref></p>
+        <p><b>Severity:</b> High</p>
+        <p><b>Workaround:</b> Run the statement <codeph>REFRESH <varname>table_name</varname></codeph>
+          after any occasion when the table structure, such as the number, names, or data types
+          of columns, is modified outside of Impala using the Kudu API.
+        </p>
+      </conbody>
+    </concept>
+
     <concept id="IMPALA-1972" rev="IMPALA-1972">
 
       <title>Queries that take a long time to plan can cause webserver to block other queries</title>
@@ -273,24 +292,6 @@ https://issues.cloudera.org/browse/IMPALA-2144 - Don't have
 
     </conbody>
 
-    <concept id="IMPALA-4106" rev="IMPALA-4106">
-      <title>Use Hive Metastore bulk API for dropping multiple partitions.</title>
-      <conbody>
-        <p>
-          The bulk partition dropping and setting feature of IMPALA-1654 is not as efficient
-          as it could be, because it currently does not use the Hive Metastore bulk API.
-        </p>
-        <p><b>Bug:</b> <xref keyref="IMPALA-4106">IMPALA-4106</xref></p>
-        <p><b>Severity:</b> High</p>
-        <p><b>Workaround:</b> Schedule <codeph>ALTER TABLE</codeph> operations that touch
-          many partitions for times when the table is not undergoing any other DDL operations,
-          and be prepared for the table to be locked for some time while the <codeph>ALTER TABLE</codeph>
-          is in progress. Test the performance of large-scale partition operations in a development
-          environment before trying on tables in a production system.
-        </p>
-      </conbody>
-    </concept>
-
     <concept id="IMPALA-1480" rev="IMPALA-1480">
 
 <!-- Not part of Alex's spreadsheet. Spreadsheet has IMPALA-1423 which mentions it's similar to this one but not a duplicate. -->