You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/07/27 05:01:19 UTC
[4/5] impala git commit: IMPALA-7225: REFRESH..PARTITION shoud not
reset partition's num rows
IMPALA-7225: REFRESH..PARTITION shoud not reset partition's num rows
The bug was that HdfsTable#reloadPartition() was not setting the
partition's numRows_ after reloading the partition.
Fix: Move the code to set numRows_ to createPartition() call so that
the callers need not explicitly set it after calling createPartition().
Additionally consolidates other common code in createPartition().
Added a test.
Change-Id: I5886684e26b453c76f331f6e807f813146c6bf3a
Reviewed-on: http://gerrit.cloudera.org:8080/11056
Reviewed-by: Todd Lipcon <to...@apache.org>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/3bbfbb5e
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/3bbfbb5e
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/3bbfbb5e
Branch: refs/heads/master
Commit: 3bbfbb5ed90f67de1fa0c343e6a8c63e796351f6
Parents: 729e1fb
Author: Bharath Vissapragada <bh...@cloudera.com>
Authored: Wed Jul 25 23:52:53 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Fri Jul 27 00:11:29 2018 +0000
----------------------------------------------------------------------
.../org/apache/impala/catalog/HdfsTable.java | 38 +++++++-------------
tests/metadata/test_refresh_partition.py | 29 +++++++++++++++
2 files changed, 42 insertions(+), 25 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/impala/blob/3bbfbb5e/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
index 4ae1f9b..0f30407 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
@@ -852,19 +852,6 @@ public class HdfsTable extends Table implements FeFsTable {
// If the partition is null, its HDFS path does not exist, and it was not added
// to this table's partition list. Skip the partition.
if (partition == null) continue;
- if (msPartition.getParameters() != null) {
- partition.setNumRows(FeCatalogUtils.getRowCount(msPartition.getParameters()));
- }
- if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) {
- // TODO: READ_ONLY isn't exactly correct because the it's possible the
- // partition does not have READ permissions either. When we start checking
- // whether we can READ from a table, this should be updated to set the
- // table's access level to the "lowest" effective level across all
- // partitions. That is, if one partition has READ_ONLY and another has
- // WRITE_ONLY the table's access level should be NONE.
- accessLevel_ = TAccessLevel.READ_ONLY;
- }
-
Path partDir = FileSystemUtil.createFullyQualifiedPath(
new Path(msPartition.getSd().getLocation()));
List<HdfsPartition> parts = partsByPath.get(partDir);
@@ -1096,6 +1083,19 @@ public class HdfsTable extends Table implements FeFsTable {
new HdfsPartition(this, msPartition, keyValues, fileFormatDescriptor,
new ArrayList<FileDescriptor>(), getAvailableAccessLevel(fs, partDirPath));
partition.checkWellFormed();
+ // Set the partition's #rows.
+ if (msPartition != null && msPartition.getParameters() != null) {
+ partition.setNumRows(FeCatalogUtils.getRowCount(msPartition.getParameters()));
+ }
+ if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) {
+ // TODO: READ_ONLY isn't exactly correct because the it's possible the
+ // partition does not have READ permissions either. When we start checking
+ // whether we can READ from a table, this should be updated to set the
+ // table's access level to the "lowest" effective level across all
+ // partitions. That is, if one partition has READ_ONLY and another has
+ // WRITE_ONLY the table's access level should be NONE.
+ accessLevel_ = TAccessLevel.READ_ONLY;
+ }
return partition;
} catch (IOException e) {
throw new CatalogException("Error initializing partition", e);
@@ -1660,18 +1660,6 @@ public class HdfsTable extends Table implements FeFsTable {
// If the partition is null, its HDFS path does not exist, and it was not added to
// this table's partition list. Skip the partition.
if (partition == null) continue;
- if (msPartition.getParameters() != null) {
- partition.setNumRows(FeCatalogUtils.getRowCount(msPartition.getParameters()));
- }
- if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) {
- // TODO: READ_ONLY isn't exactly correct because the it's possible the
- // partition does not have READ permissions either. When we start checking
- // whether we can READ from a table, this should be updated to set the
- // table's access level to the "lowest" effective level across all
- // partitions. That is, if one partition has READ_ONLY and another has
- // WRITE_ONLY the table's access level should be NONE.
- accessLevel_ = TAccessLevel.READ_ONLY;
- }
refreshPartitionFileMetadata(partition);
}
}
http://git-wip-us.apache.org/repos/asf/impala/blob/3bbfbb5e/tests/metadata/test_refresh_partition.py
----------------------------------------------------------------------
diff --git a/tests/metadata/test_refresh_partition.py b/tests/metadata/test_refresh_partition.py
index 187055c..2c31b93 100644
--- a/tests/metadata/test_refresh_partition.py
+++ b/tests/metadata/test_refresh_partition.py
@@ -44,6 +44,35 @@ class TestRefreshPartition(ImpalaTestSuite):
cls.ImpalaTestMatrix.add_dimension(
create_uncompressed_text_dimension(cls.get_workload()))
+ def test_refresh_partition_num_rows(self, vector, unique_database):
+ """Refreshing a partition should not change it's numRows stat."""
+ # Create a partitioned table and add data to it.
+ tbl = unique_database + ".t1"
+ self.client.execute("create table %s(a int) partitioned by (b int)" % tbl)
+ self.client.execute("insert into %s partition(b=1) values (1)" % tbl)
+ # Compute stats on tbl. It should populate the partition num rows.
+ self.client.execute("compute stats %s" % tbl)
+ result = self.client.execute("show partitions %s" % tbl)
+ # Format: partition/#Rows/#Files (first 3 entries)
+ assert result.get_data().startswith("1\t1\t1"),\
+ "Incorrect partition stats %s" % result.get_data()
+ # Add another file to the same partition using hive.
+ self.run_stmt_in_hive("insert into table %s partition (b=1) values (2)" % tbl)
+ # Make sure Impala still sees a single row.
+ assert "1" == self.client.execute("select count(*) from %s" % tbl).get_data()
+ # refresh the partition and make sure the new row is visible
+ self.client.execute("refresh %s partition (b=1)" % tbl)
+ assert "2" == self.client.execute("select count(*) from %s" % tbl).get_data()
+ # Make sure the partition num rows are unchanged and still 1 but the #files is updated.
+ result = self.client.execute("show partitions %s" % tbl)
+ assert result.get_data().startswith("1\t1\t2"),\
+ "Incorrect partition stats %s" % result.get_data()
+ # Do a full table refresh and it should still remain the same.
+ self.client.execute("refresh %s" % tbl)
+ result = self.client.execute("show partitions %s" % tbl)
+ assert result.get_data().startswith("1\t1\t2"),\
+ "Incorrect partition stats %s" % result.get_data()
+
def test_add_hive_partition_and_refresh(self, vector, unique_database):
"""
Partition added in Hive can be viewed in Impala after refreshing