You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2016/05/13 06:09:36 UTC
[05/10] incubator-impala git commit: IMPALA-3459: Add test for DROP
TABLE PURGE for S3
IMPALA-3459: Add test for DROP TABLE PURGE for S3
It was previously thought that PURGE had no effect on S3. However,
the Hive Metastore actually created a .Trash directory and copied the
files there when a DROP TABLE was conducted from Impala.
This patch just enables the existing PURGE tests for S3. There were a
few reasons this wasn't working before. The paths given to the S3
client (boto3) should not have a leading "/". This has been fixed,
as it makes no difference for HDFS whether the leading "/" exists or not.
Also, PURGE is a pure delete whereas a regular DROP is a copy. A copy
is consistent whereas a delete is only eventually consistent, so when
we PURGE a table or partition, the files will still be visible for
some time after the query has completed. The tests have been modified
to accommodate this case as well.
Change-Id: I52d2451e090b00ae2fd9a879c28defa6c940047c
Reviewed-on: http://gerrit.cloudera.org:8080/3036
Reviewed-by: Sailesh Mukil <sa...@cloudera.com>
Tested-by: Internal Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/7e0cbaf1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/7e0cbaf1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/7e0cbaf1
Branch: refs/heads/master
Commit: 7e0cbaf1a06da075639b36290b1ec09ef82122e0
Parents: 6910f49
Author: Sailesh Mukil <sa...@cloudera.com>
Authored: Wed May 11 18:22:17 2016 -0700
Committer: Tim Armstrong <ta...@cloudera.com>
Committed: Thu May 12 23:06:36 2016 -0700
----------------------------------------------------------------------
tests/common/skip.py | 2 -
tests/metadata/test_ddl.py | 94 +++++++++++++++++++++++------------------
2 files changed, 52 insertions(+), 44 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7e0cbaf1/tests/common/skip.py
----------------------------------------------------------------------
diff --git a/tests/common/skip.py b/tests/common/skip.py
index 3c4fe27..b2f52ba 100644
--- a/tests/common/skip.py
+++ b/tests/common/skip.py
@@ -34,8 +34,6 @@ class SkipIfS3:
jira = partial(pytest.mark.skipif, IS_S3)
hdfs_encryption = pytest.mark.skipif(IS_S3,
reason="HDFS encryption is not supported with S3")
- hdfs_purge = pytest.mark.skipif(IS_S3,
- reason="PURGE has no effect on S3")
# These ones need test infra work to re-enable.
udfs = pytest.mark.skipif(IS_S3, reason="udas/udfs not copied to S3")
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7e0cbaf1/tests/metadata/test_ddl.py
----------------------------------------------------------------------
diff --git a/tests/metadata/test_ddl.py b/tests/metadata/test_ddl.py
index 791d68d..0a1900c 100644
--- a/tests/metadata/test_ddl.py
+++ b/tests/metadata/test_ddl.py
@@ -68,7 +68,6 @@ class TestDdlStatements(ImpalaTestSuite):
for dir_ in ['part_data', 't1_tmp1', 't_part_tmp']:
self.filesystem_client.delete_file_dir('test-warehouse/%s' % dir_, recursive=True)
- @SkipIfS3.hdfs_purge
@SkipIfLocal.hdfs_client
@pytest.mark.execute_serially
def test_drop_table_with_purge(self):
@@ -80,41 +79,48 @@ class TestDdlStatements(ImpalaTestSuite):
self.client.execute("create table {0}.t1(i int)".format(DDL_PURGE_DB))
self.client.execute("create table {0}.t2(i int)".format(DDL_PURGE_DB))
# Create sample test data files under the table directories
- self.hdfs_client.create_file("test-warehouse/{0}.db/t1/t1.txt".format(DDL_PURGE_DB),\
- file_data='t1')
- self.hdfs_client.create_file("test-warehouse/{0}.db/t2/t2.txt".format(DDL_PURGE_DB),\
- file_data='t2')
+ self.filesystem_client.create_file("test-warehouse/{0}.db/t1/t1.txt".\
+ format(DDL_PURGE_DB), file_data='t1')
+ self.filesystem_client.create_file("test-warehouse/{0}.db/t2/t2.txt".\
+ format(DDL_PURGE_DB), file_data='t2')
# Drop the table (without purge) and make sure it exists in trash
self.client.execute("drop table {0}.t1".format(DDL_PURGE_DB))
- assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/t1.txt".\
+ assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/t1.txt".\
format(DDL_PURGE_DB))
- assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/".format(DDL_PURGE_DB))
- assert self.hdfs_client.exists(\
- "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/t1.txt".\
+ assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/".\
+ format(DDL_PURGE_DB))
+ assert self.filesystem_client.exists(\
+ "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/t1.txt".\
format(getpass.getuser(), DDL_PURGE_DB))
- assert self.hdfs_client.exists(\
- "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1".\
+ assert self.filesystem_client.exists(\
+ "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1".\
format(getpass.getuser(), DDL_PURGE_DB))
# Drop the table (with purge) and make sure it doesn't exist in trash
self.client.execute("drop table {0}.t2 purge".format(DDL_PURGE_DB))
- assert not self.hdfs_client.exists("test-warehouse/{0}.db/t2/".format(DDL_PURGE_DB))
- assert not self.hdfs_client.exists("test-warehouse/{0}.db/t2/t2.txt".\
- format(DDL_PURGE_DB))
- assert not self.hdfs_client.exists(\
- "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t2/t2.txt".\
+ if not IS_S3:
+ # In S3, deletes are eventual. So even though we dropped the table, the files
+ # belonging to this table will still be visible for some unbounded time. This
+ # happens only with PURGE. A regular DROP TABLE is just a copy of files which is
+ # consistent.
+ assert not self.filesystem_client.exists("test-warehouse/{0}.db/t2/".\
+ format(DDL_PURGE_DB))
+ assert not self.filesystem_client.exists("test-warehouse/{0}.db/t2/t2.txt".\
+ format(DDL_PURGE_DB))
+ assert not self.filesystem_client.exists(\
+ "user/{0}/.Trash/Current/test-warehouse/{1}.db/t2/t2.txt".\
format(getpass.getuser(), DDL_PURGE_DB))
- assert not self.hdfs_client.exists(\
- "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t2".\
+ assert not self.filesystem_client.exists(\
+ "user/{0}/.Trash/Current/test-warehouse/{1}.db/t2".\
format(getpass.getuser(), DDL_PURGE_DB))
# Create an external table t3 and run the same test as above. Make
# sure the data is not deleted
- self.hdfs_client.make_dir("test-warehouse/data_t3/", permission=777)
- self.hdfs_client.create_file("test-warehouse/data_t3/data.txt", file_data='100')
+ self.filesystem_client.make_dir("test-warehouse/data_t3/", permission=777)
+ self.filesystem_client.create_file("test-warehouse/data_t3/data.txt", file_data='100')
self.client.execute("create external table {0}.t3(i int) stored as \
textfile location \'/test-warehouse/data_t3\'" .format(DDL_PURGE_DB))
self.client.execute("drop table {0}.t3 purge".format(DDL_PURGE_DB))
- assert self.hdfs_client.exists("test-warehouse/data_t3/data.txt")
- self.hdfs_client.delete_file_dir("test-warehouse/data_t3", recursive=True)
+ assert self.filesystem_client.exists("test-warehouse/data_t3/data.txt")
+ self.filesystem_client.delete_file_dir("test-warehouse/data_t3", recursive=True)
@SkipIfLocal.hdfs_client
@pytest.mark.execute_serially
@@ -306,7 +312,6 @@ class TestDdlStatements(ImpalaTestSuite):
self.run_test_case('QueryTest/alter-table', vector, use_db='alter_table_test_db',
multiple_impalad=self._use_multiple_impalad(vector))
- @SkipIfS3.hdfs_purge # S3: missing coverage: alter table drop partition
@SkipIfLocal.hdfs_client
@pytest.mark.execute_serially
def test_drop_partition_with_purge(self, vector):
@@ -315,38 +320,43 @@ class TestDdlStatements(ImpalaTestSuite):
# Create a sample database alter_purge_db and table t1 in it
self._create_db(ALTER_PURGE_DB)
self.client.execute("create table {0}.t1(i int) partitioned\
- by (j int)".format(ALTER_PURGE_DB))
+ by (j int)".format(ALTER_PURGE_DB))
# Add two partitions (j=1) and (j=2) to table t1
self.client.execute("alter table {0}.t1 add partition(j=1)".format(ALTER_PURGE_DB))
self.client.execute("alter table {0}.t1 add partition(j=2)".format(ALTER_PURGE_DB))
- self.hdfs_client.create_file(\
- "test-warehouse/{0}.db/t1/j=1/j1.txt".format(ALTER_PURGE_DB), file_data='j1')
- self.hdfs_client.create_file(\
- "test-warehouse/{0}.db/t1/j=2/j2.txt".format(ALTER_PURGE_DB), file_data='j2')
+ self.filesystem_client.create_file(\
+ "test-warehouse/{0}.db/t1/j=1/j1.txt".format(ALTER_PURGE_DB), file_data='j1')
+ self.filesystem_client.create_file(\
+ "test-warehouse/{0}.db/t1/j=2/j2.txt".format(ALTER_PURGE_DB), file_data='j2')
# Drop the partition (j=1) without purge and make sure it exists in trash
self.client.execute("alter table {0}.t1 drop partition(j=1)".format(ALTER_PURGE_DB));
- assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1/j1.txt".\
+ assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/j=1/j1.txt".\
format(ALTER_PURGE_DB))
- assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1".\
+ assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/j=1".\
format(ALTER_PURGE_DB))
- assert self.hdfs_client.exists(\
- "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1/j1.txt".\
+ assert self.filesystem_client.exists(\
+ "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1/j1.txt".\
format(getpass.getuser(), ALTER_PURGE_DB))
- assert self.hdfs_client.exists(\
- "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1".\
+ assert self.filesystem_client.exists(\
+ "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1".\
format(getpass.getuser(), ALTER_PURGE_DB))
# Drop the partition (with purge) and make sure it doesn't exist in trash
self.client.execute("alter table {0}.t1 drop partition(j=2) purge".\
format(ALTER_PURGE_DB));
- assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2/j2.txt".\
- format(ALTER_PURGE_DB))
- assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2".\
- format(ALTER_PURGE_DB))
- assert not self.hdfs_client.exists(\
- "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=2".\
+ if not IS_S3:
+ # In S3, deletes are eventual. So even though we dropped the partition, the files
+ # belonging to this partition will still be visible for some unbounded time. This
+ # happens only with PURGE. A regular DROP TABLE is just a copy of files which is
+ # consistent.
+ assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/j=2/j2.txt".\
+ format(ALTER_PURGE_DB))
+ assert not self.filesystem_client.exists("test-warehouse/{0}.db/t1/j=2".\
+ format(ALTER_PURGE_DB))
+ assert not self.filesystem_client.exists(\
+ "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=2".\
format(getpass.getuser(), ALTER_PURGE_DB))
- assert not self.hdfs_client.exists(
- "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=2/j2.txt".\
+ assert not self.filesystem_client.exists(
+ "user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=2/j2.txt".\
format(getpass.getuser(), ALTER_PURGE_DB))
@pytest.mark.execute_serially