You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2023/08/17 21:06:24 UTC

[impala] 01/02: IMPALA-12335: [DOCS] Add documentation about the DELETE statement

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 12276c79f9975dc63322138ea56290434a49221d
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Tue Aug 15 14:47:03 2023 +0200

    IMPALA-12335: [DOCS] Add documentation about the DELETE statement
    
    IMPALA-11877 added support for the DELETE statement for Iceberg
    tables. This patch documents this feature.
    
    Change-Id: If111a7ecd20bda2d4928332ef2ccd905814cb203
    Reviewed-on: http://gerrit.cloudera.org:8080/20361
    Reviewed-by: Zoltan Borok-Nagy <bo...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 docs/impala_keydefs.ditamap    |  1 +
 docs/topics/impala_delete.xml  | 35 +++++++++++++----------
 docs/topics/impala_iceberg.xml | 63 ++++++++++++++++++++++++++++++++++++++----
 3 files changed, 80 insertions(+), 19 deletions(-)

diff --git a/docs/impala_keydefs.ditamap b/docs/impala_keydefs.ditamap
index 737da1089..601d8208a 100644
--- a/docs/impala_keydefs.ditamap
+++ b/docs/impala_keydefs.ditamap
@@ -10529,6 +10529,7 @@ under the License.
   <keydef href="https://issues.apache.org/jira/browse/IMPALA-9999" scope="external" format="html" keys="IMPALA-9999"/>
 
 <!-- Short form of mapping from Impala release to vendor-specific releases, for use in headings. -->
+  <keydef keys="impala43"><topicmeta><keywords><keyword>Impala 4.3</keyword></keywords></topicmeta></keydef>
   <keydef keys="impala42"><topicmeta><keywords><keyword>Impala 4.2</keyword></keywords></topicmeta></keydef>
   <keydef keys="impala41"><topicmeta><keywords><keyword>Impala 4.1</keyword></keywords></topicmeta></keydef>
   <keydef keys="impala40"><topicmeta><keywords><keyword>Impala 4.0</keyword></keywords></topicmeta></keydef>
diff --git a/docs/topics/impala_delete.xml b/docs/topics/impala_delete.xml
index c8591b6bb..5611e9d42 100644
--- a/docs/topics/impala_delete.xml
+++ b/docs/topics/impala_delete.xml
@@ -27,6 +27,7 @@ under the License.
       <data name="Category" value="Impala"/>
       <data name="Category" value="SQL"/>
       <data name="Category" value="Kudu"/>
+      <data name="Category" value="Iceberg"/>
       <data name="Category" value="ETL"/>
       <data name="Category" value="Ingest"/>
       <data name="Category" value="DML"/>
@@ -39,8 +40,8 @@ under the License.
 
     <p>
       <indexterm audience="hidden">DELETE statement</indexterm>
-      Deletes an arbitrary number of rows from a Kudu table.
-      This statement only works for Impala tables that use the Kudu storage engine.
+      Deletes an arbitrary number of rows from a table.
+      This statement only works for Kudu (from <keyword keyref="impala28"/>) and Iceberg tables (from <keyword keyref="impala43"/>).
     </p>
 
     <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
@@ -61,8 +62,8 @@ DELETE <varname>table_ref</varname> FROM [<varname>joined_table_refs</varname>]
     <p>
       The second form evaluates one or more join clauses, and deletes
       all matching rows from one of the tables. The join clauses can
-      include non-Kudu tables, but the table from which the rows
-      are deleted must be a Kudu table. The <codeph>FROM</codeph>
+      include tables of any kind, but the table from which the rows
+      are deleted must be a Kudu or Iceberg table. The <codeph>FROM</codeph>
       keyword is required in this case, to separate the name of
       the table whose rows are being deleted from the table names
       of the join clauses.
@@ -80,7 +81,7 @@ DELETE <varname>table_ref</varname> FROM [<varname>joined_table_refs</varname>]
       The conditions in the <codeph>WHERE</codeph> clause can refer to
       any combination of primary key columns or other columns. Referring to
       primary key columns in the <codeph>WHERE</codeph> clause is more efficient
-      than referring to non-primary key columns.
+      than referring to non-primary key columns (in case of Kudu tables).
     </p>
 
     <p>
@@ -113,6 +114,11 @@ DELETE <varname>table_ref</varname> FROM [<varname>joined_table_refs</varname>]
       </li>
     </ul>
 
+    <p>
+      Iceberg also allows concurrent DELETE operations, in which case the concurrent DELETEs might
+      remove the same rows. This won't corrupt the table as Iceberg allows such behavior.
+    </p>
+
     <p>
       The number of affected rows is reported in an <cmdname>impala-shell</cmdname> message
       and in the query profile.
@@ -132,16 +138,16 @@ DELETE <varname>table_ref</varname> FROM [<varname>joined_table_refs</varname>]
 
 <codeblock>
 -- Deletes all rows. The FROM keyword is optional.
-DELETE FROM kudu_table;
-DELETE kudu_table;
+DELETE FROM table;
+DELETE table;
 
 -- Deletes 0, 1, or more rows.
 -- (If c1 is a single-column primary key, the statement could only
 -- delete 0 or 1 rows.)
-DELETE FROM kudu_table WHERE c1 = 100;
+DELETE FROM table WHERE c1 = 100;
 
 -- Deletes all rows that match all the WHERE conditions.
-DELETE FROM kudu_table WHERE
+DELETE FROM table WHERE
   (c1 > c2 OR c3 IN ('hello','world')) AND c4 IS NOT NULL;
 DELETE FROM t1 WHERE
   (c1 IN (1,2,3) AND c2 > c3) OR c4 IS NOT NULL;
@@ -153,7 +159,7 @@ DELETE FROM t1 WHERE
   c5 IN (SELECT DISTINCT other_col FROM other_table);
 
 -- Does not delete any rows, because the WHERE condition is always false.
-DELETE FROM kudu_table WHERE 1 = 0;
+DELETE FROM table WHERE 1 = 0;
 </codeblock>
 
     <p>
@@ -169,17 +175,18 @@ DELETE t1 FROM t1 JOIN t2 ON t1.x = t2.x;
 DELETE t1 FROM t1 JOIN t2 ON t1.x = t2.x
   WHERE t1.y = FALSE and t2.z > 100;
 
--- Delete from a Kudu table based on a join with a non-Kudu table.
-DELETE t1 FROM kudu_table t1 JOIN non_kudu_table t2 ON t1.x = t2.x;
+-- Delete from a table based on a join with another table.
+DELETE t1 FROM table t1 JOIN other_table t2 ON t1.x = t2.x;
 
--- The tables can be joined in any order as long as the Kudu table
+-- The tables can be joined in any order as long as the Kudu or Iceberg table
 -- is specified as the deletion target.
-DELETE t2 FROM non_kudu_table t1 JOIN kudu_table t2 ON t1.x = t2.x;
+DELETE t2 FROM non_kudu_non_ice_table t1 JOIN kudu_or_ice_table t2 ON t1.x = t2.x;
 </codeblock>
 
     <p conref="../shared/impala_common.xml#common/related_info"/>
 
     <p>
+      <xref href="impala_iceberg.xml#impala_iceberg"/>,
       <xref href="impala_kudu.xml#impala_kudu"/>, <xref href="impala_insert.xml#insert"/>,
       <xref href="impala_update.xml#update"/>, <xref href="impala_upsert.xml#upsert"/>
     </p>
diff --git a/docs/topics/impala_iceberg.xml b/docs/topics/impala_iceberg.xml
index 1504efcdb..66dd7fd8c 100644
--- a/docs/topics/impala_iceberg.xml
+++ b/docs/topics/impala_iceberg.xml
@@ -43,10 +43,6 @@ under the License.
       Iceberg Catalogs (e.g. HiveCatalog, HadoopCatalog). It also supports location-based
       tables (HadoopTables).
     </p>
-    <p>
-      Currently only Iceberg V1 DML operations are allowed, i.e. INSERT INTO /INSERT OVERWRITE.
-      Iceberg V2 operations like row-level modifications (UPDATE, DELETE) are not supported yet.
-    </p>
 
     <p>
       For more information on Iceberg, see <xref keyref="upstream_iceberg_site"/>.
@@ -214,6 +210,37 @@ CREATE TABLE ice_ctas_part_spec PARTITIONED BY SPEC (truncate(3, s)) STORED AS I
     </conbody>
   </concept>
 
+  <concept id="iceberg_v2">
+    <title>Iceberg V2 tables</title>
+    <conbody>
+      <p>
+        Iceberg V2 tables support row-level modifications (DELETE, UPDATE) via "merge-on-read", which means instead
+        of rewriting existing data files, separate so-called delete files are being written that store information
+        about the deleted records. There are two kinds of delete files in Iceberg:
+        <ul>
+          <li>position deletes</li>
+          <li>equality deletes</li>
+        </ul>
+        Impala only supports position delete files. These files contain the file path and file position of the deleted
+        rows.
+      </p>
+      <p>
+        One can create Iceberg V2 tables via the <codeph>CREATE TABLE</codeph> statement, they just need to specify
+        the 'format-version' table property:
+        <codeblock>
+CREATE TABLE ice_v2 (i int) STORED BY ICEBERG TBLPROPERTIES('format-version'='2');
+        </codeblock>
+      </p>
+      <p>
+        It is also possible to upgrade existing Iceberg V1 tables to Iceberg V2 tables. One can use the following
+        <codeph>ALTER TABLE</codeph> statement to do so:
+        <codeblock>
+ALTER TABLE ice_v1_to_v2 SET TBLPROPERTIES('format-version'='2');
+        </codeblock>
+      </p>
+    </conbody>
+  </concept>
+
   <concept id="iceberg_drop">
     <title>Dropping Iceberg tables</title>
     <conbody>
@@ -418,7 +445,7 @@ CREATE TABLE ice_p (i INT, b INT) PARTITIONED BY (p1 INT, p2 STRING) STORED AS I
   </concept>
 
   <concept id="iceberg_inserts">
-    <title>Writing Iceberg tables</title>
+    <title>Inserting data into Iceberg tables</title>
     <conbody>
       <p>
         Impala is also able to insert new data to Iceberg tables. Currently the <codeph>INSERT INTO</codeph>
@@ -447,6 +474,32 @@ INSERT INTO ice_p VALUES (1, 2);
     </conbody>
   </concept>
 
+  <concept id="iceberg_delete">
+    <title>Delete data from Iceberg tables</title>
+    <conbody>
+      <p>
+        Since <keyword keyref="impala43"/> Impala is able to run <codeph>DELETE</codeph> statements against
+        Iceberg V2 tables. E.g.:
+        <codeblock>
+DELETE FROM ice_t where i = 3;
+        </codeblock>
+      </p>
+      <p>
+        More information about the <codeph>DELETE</codeph> statement can be found at <xref href="impala_delete.xml#delete"/>.
+      </p>
+    </conbody>
+  </concept>
+
+  <concept id="iceberg_update">
+    <title>Updating data int Iceberg tables</title>
+    <conbody>
+      <p>
+        Impala does not yet support the UPDATE statement. One can mimic its behaviour by calling
+        DELETE + INSERT statements one after the other, but please note that this creates two transactions.
+      </p>
+    </conbody>
+  </concept>
+
   <concept id="iceberg_load">
     <title>Loading data into Iceberg tables</title>
     <conbody>