You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2016/05/13 06:09:36 UTC

[05/10] incubator-impala git commit: IMPALA-3459: Add test for DROP TABLE PURGE for S3

IMPALA-3459: Add test for DROP TABLE PURGE for S3

It was previously thought that PURGE had no effect on S3. However,
the Hive Metastore actually created a .Trash directory and copied the
files there when a DROP TABLE was conducted from Impala.

This patch just enables the existing PURGE tests for S3. There were a
few reasons this wasn't working before. The paths given to the S3
client (boto3) should not have a leading "/". This has been fixed, as
the presence or absence of the leading slash makes no difference for HDFS.

Also, PURGE is a pure delete whereas a regular DROP is a copy. A copy
is consistent whereas a delete is only eventually consistent, so when
we PURGE a table or partition, the files will still be visible for
some time after the query has completed. The tests have been modified
to accommodate this case as well.

Change-Id: I52d2451e090b00ae2fd9a879c28defa6c940047c
Reviewed-on: http://gerrit.cloudera.org:8080/3036
Reviewed-by: Sailesh Mukil <sa...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/7e0cbaf1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/7e0cbaf1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/7e0cbaf1

Branch: refs/heads/master
Commit: 7e0cbaf1a06da075639b36290b1ec09ef82122e0
Parents: 6910f49
Author: Sailesh Mukil <sa...@cloudera.com>
Authored: Wed May 11 18:22:17 2016 -0700
Committer: Tim Armstrong <ta...@cloudera.com>
Committed: Thu May 12 23:06:36 2016 -0700

----------------------------------------------------------------------
 tests/common/skip.py       |  2 -
 tests/metadata/test_ddl.py | 94 +++++++++++++++++++++++------------------
 2 files changed, 52 insertions(+), 44 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7e0cbaf1/tests/common/skip.py
----------------------------------------------------------------------
diff --git a/tests/common/skip.py b/tests/common/skip.py
index 3c4fe27..b2f52ba 100644
--- a/tests/common/skip.py
+++ b/tests/common/skip.py
@@ -34,8 +34,6 @@ class SkipIfS3:
   jira = partial(pytest.mark.skipif, IS_S3)
   hdfs_encryption = pytest.mark.skipif(IS_S3,
       reason="HDFS encryption is not supported with S3")
-  hdfs_purge = pytest.mark.skipif(IS_S3,
-      reason="PURGE has no effect on S3")
 
   # These ones need test infra work to re-enable.
   udfs = pytest.mark.skipif(IS_S3, reason="udas/udfs not copied to S3")

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7e0cbaf1/tests/metadata/test_ddl.py
----------------------------------------------------------------------
diff --git a/tests/metadata/test_ddl.py b/tests/metadata/test_ddl.py
index 791d68d..0a1900c 100644
--- a/tests/metadata/test_ddl.py
+++ b/tests/metadata/test_ddl.py
@@ -68,7 +68,6 @@ class TestDdlStatements(ImpalaTestSuite):
     for dir_ in ['part_data', 't1_tmp1', 't_part_tmp']:
       self.filesystem_client.delete_file_dir('test-warehouse/%s' % dir_, recursive=True)
 
-  @SkipIfS3.hdfs_purge
   @SkipIfLocal.hdfs_client
   @pytest.mark.execute_serially
   def test_drop_table_with_purge(self):
@@ -80,41 +79,48 @@ class TestDdlStatements(ImpalaTestSuite):
     self.client.execute("create table {0}.t1(i int)".format(DDL_PURGE_DB))
     self.client.execute("create table {0}.t2(i int)".format(DDL_PURGE_DB))
     # Create sample test data files under the table directories
-    self.hdfs_client.create_file("test-warehouse/{0}.db/t1/t1.txt".format(DDL_PURGE_DB),\
-        file_data='t1')
-    self.hdfs_client.create_file("test-warehouse/{0}.db/t2/t2.txt".format(DDL_PURGE_DB),\
-        file_data='t2')
+    self.filesystem_client.create_file("test-warehouse/{0}.db/t1/t1.txt".\
+        format(DDL_PURGE_DB), file_data='t1')
+    self.filesystem_client.create_file("test-warehouse/{0}.db/t2/t2.txt".\
+        format(DDL_PURGE_DB), file_data='t2')
     # Drop the table (without purge) and make sure it exists in trash
     self.client.execute("drop table {0}.t1".format(DDL_PURGE_DB))
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/t1.txt".\
+    assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/t1.txt".\
         format(DDL_PURGE_DB))
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/".format(DDL_PURGE_DB))
-    assert self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/t1.txt".\
+    assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/".\
+        format(DDL_PURGE_DB))
+    assert self.filesystem_client.exists(\
+        "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/t1.txt".\
         format(getpass.getuser(), DDL_PURGE_DB))
-    assert self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1".\
+    assert self.filesystem_client.exists(\
+        "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1".\
         format(getpass.getuser(), DDL_PURGE_DB))
     # Drop the table (with purge) and make sure it doesn't exist in trash
     self.client.execute("drop table {0}.t2 purge".format(DDL_PURGE_DB))
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t2/".format(DDL_PURGE_DB))
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t2/t2.txt".\
-        format(DDL_PURGE_DB))
-    assert not self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t2/t2.txt".\
+    if not IS_S3:
+      # In S3, deletes are eventual. So even though we dropped the table, the files
+      # belonging to this table will still be visible for some unbounded time. This
+      # happens only with PURGE. A regular DROP TABLE is just a copy of files which is
+      # consistent.
+      assert not self.filesystem_client.exists("test-warehouse/{0}.db/t2/".\
+          format(DDL_PURGE_DB))
+      assert not self.filesystem_client.exists("test-warehouse/{0}.db/t2/t2.txt".\
+          format(DDL_PURGE_DB))
+    assert not self.filesystem_client.exists(\
+        "user/{0}/.Trash/Current/test-warehouse/{1}.db/t2/t2.txt".\
         format(getpass.getuser(), DDL_PURGE_DB))
-    assert not self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t2".\
+    assert not self.filesystem_client.exists(\
+        "user/{0}/.Trash/Current/test-warehouse/{1}.db/t2".\
         format(getpass.getuser(), DDL_PURGE_DB))
     # Create an external table t3 and run the same test as above. Make
     # sure the data is not deleted
-    self.hdfs_client.make_dir("test-warehouse/data_t3/", permission=777)
-    self.hdfs_client.create_file("test-warehouse/data_t3/data.txt", file_data='100')
+    self.filesystem_client.make_dir("test-warehouse/data_t3/", permission=777)
+    self.filesystem_client.create_file("test-warehouse/data_t3/data.txt", file_data='100')
     self.client.execute("create external table {0}.t3(i int) stored as \
       textfile location \'/test-warehouse/data_t3\'" .format(DDL_PURGE_DB))
     self.client.execute("drop table {0}.t3 purge".format(DDL_PURGE_DB))
-    assert self.hdfs_client.exists("test-warehouse/data_t3/data.txt")
-    self.hdfs_client.delete_file_dir("test-warehouse/data_t3", recursive=True)
+    assert self.filesystem_client.exists("test-warehouse/data_t3/data.txt")
+    self.filesystem_client.delete_file_dir("test-warehouse/data_t3", recursive=True)
 
   @SkipIfLocal.hdfs_client
   @pytest.mark.execute_serially
@@ -306,7 +312,6 @@ class TestDdlStatements(ImpalaTestSuite):
     self.run_test_case('QueryTest/alter-table', vector, use_db='alter_table_test_db',
         multiple_impalad=self._use_multiple_impalad(vector))
 
-  @SkipIfS3.hdfs_purge # S3: missing coverage: alter table drop partition
   @SkipIfLocal.hdfs_client
   @pytest.mark.execute_serially
   def test_drop_partition_with_purge(self, vector):
@@ -315,38 +320,43 @@ class TestDdlStatements(ImpalaTestSuite):
     # Create a sample database alter_purge_db and table t1 in it
     self._create_db(ALTER_PURGE_DB)
     self.client.execute("create table {0}.t1(i int) partitioned\
-      by (j int)".format(ALTER_PURGE_DB))
+        by (j int)".format(ALTER_PURGE_DB))
     # Add two partitions (j=1) and (j=2) to table t1
     self.client.execute("alter table {0}.t1 add partition(j=1)".format(ALTER_PURGE_DB))
     self.client.execute("alter table {0}.t1 add partition(j=2)".format(ALTER_PURGE_DB))
-    self.hdfs_client.create_file(\
-            "test-warehouse/{0}.db/t1/j=1/j1.txt".format(ALTER_PURGE_DB), file_data='j1')
-    self.hdfs_client.create_file(\
-            "test-warehouse/{0}.db/t1/j=2/j2.txt".format(ALTER_PURGE_DB), file_data='j2')
+    self.filesystem_client.create_file(\
+        "test-warehouse/{0}.db/t1/j=1/j1.txt".format(ALTER_PURGE_DB), file_data='j1')
+    self.filesystem_client.create_file(\
+        "test-warehouse/{0}.db/t1/j=2/j2.txt".format(ALTER_PURGE_DB), file_data='j2')
     # Drop the partition (j=1) without purge and make sure it exists in trash
     self.client.execute("alter table {0}.t1 drop partition(j=1)".format(ALTER_PURGE_DB));
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1/j1.txt".\
+    assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/j=1/j1.txt".\
         format(ALTER_PURGE_DB))
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1".\
+    assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/j=1".\
         format(ALTER_PURGE_DB))
-    assert self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1/j1.txt".\
+    assert self.filesystem_client.exists(\
+        "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1/j1.txt".\
         format(getpass.getuser(), ALTER_PURGE_DB))
-    assert self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1".\
+    assert self.filesystem_client.exists(\
+        "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1".\
         format(getpass.getuser(), ALTER_PURGE_DB))
     # Drop the partition (with purge) and make sure it doesn't exist in trash
     self.client.execute("alter table {0}.t1 drop partition(j=2) purge".\
         format(ALTER_PURGE_DB));
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2/j2.txt".\
-        format(ALTER_PURGE_DB))
-    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2".\
-        format(ALTER_PURGE_DB))
-    assert not self.hdfs_client.exists(\
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=2".\
+    if not IS_S3:
+      # In S3, deletes are eventual. So even though we dropped the partition, the files
+      # belonging to this partition will still be visible for some unbounded time. This
+      # happens only with PURGE. A regular DROP TABLE is just a copy of files which is
+      # consistent.
+      assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/j=2/j2.txt".\
+          format(ALTER_PURGE_DB))
+      assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/j=2".\
+          format(ALTER_PURGE_DB))
+    assert not self.filesystem_client.exists(\
+        "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=2".\
         format(getpass.getuser(), ALTER_PURGE_DB))
-    assert not self.hdfs_client.exists(
-        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=2/j2.txt".\
+    assert not self.filesystem_client.exists(
+        "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=2/j2.txt".\
         format(getpass.getuser(), ALTER_PURGE_DB))
 
   @pytest.mark.execute_serially