You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/07/27 05:01:19 UTC
[4/5] impala git commit: IMPALA-7225: REFRESH..PARTITION shoud not reset partition's num rows

IMPALA-7225: REFRESH..PARTITION shoud not reset partition's num rows

The bug was that HdfsTable#reloadPartition() was not setting the
partition's numRows_ after reloading the partition.

Fix: Move the code to set numRows_ to createPartition() call so that
the callers need not explicitly set it after calling createPartition().
Additionally consolidates other common code in createPartition().

Added a test.

Change-Id: I5886684e26b453c76f331f6e807f813146c6bf3a
Reviewed-on: http://gerrit.cloudera.org:8080/11056
Reviewed-by: Todd Lipcon <to...@apache.org>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/3bbfbb5e
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/3bbfbb5e
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/3bbfbb5e

Branch: refs/heads/master
Commit: 3bbfbb5ed90f67de1fa0c343e6a8c63e796351f6
Parents: 729e1fb
Author: Bharath Vissapragada <bh...@cloudera.com>
Authored: Wed Jul 25 23:52:53 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Fri Jul 27 00:11:29 2018 +0000

----------------------------------------------------------------------
 .../org/apache/impala/catalog/HdfsTable.java    | 38 +++++++-------------
 tests/metadata/test_refresh_partition.py        | 29 +++++++++++++++
 2 files changed, 42 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/3bbfbb5e/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
index 4ae1f9b..0f30407 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
@@ -852,19 +852,6 @@ public class HdfsTable extends Table implements FeFsTable {
         // If the partition is null, its HDFS path does not exist, and it was not added
         // to this table's partition list. Skip the partition.
         if (partition == null) continue;
-        if (msPartition.getParameters() != null) {
-          partition.setNumRows(FeCatalogUtils.getRowCount(msPartition.getParameters()));
-        }
-        if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) {
-          // TODO: READ_ONLY isn't exactly correct because the it's possible the
-          // partition does not have READ permissions either. When we start checking
-          // whether we can READ from a table, this should be updated to set the
-          // table's access level to the "lowest" effective level across all
-          // partitions. That is, if one partition has READ_ONLY and another has
-          // WRITE_ONLY the table's access level should be NONE.
-          accessLevel_ = TAccessLevel.READ_ONLY;
-        }
-
         Path partDir = FileSystemUtil.createFullyQualifiedPath(
             new Path(msPartition.getSd().getLocation()));
         List<HdfsPartition> parts = partsByPath.get(partDir);
@@ -1096,6 +1083,19 @@ public class HdfsTable extends Table implements FeFsTable {
           new HdfsPartition(this, msPartition, keyValues, fileFormatDescriptor,
           new ArrayList<FileDescriptor>(), getAvailableAccessLevel(fs, partDirPath));
       partition.checkWellFormed();
+      // Set the partition's #rows.
+      if (msPartition != null && msPartition.getParameters() != null) {
+         partition.setNumRows(FeCatalogUtils.getRowCount(msPartition.getParameters()));
+      }
+      if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) {
+          // TODO: READ_ONLY isn't exactly correct because the it's possible the
+          // partition does not have READ permissions either. When we start checking
+          // whether we can READ from a table, this should be updated to set the
+          // table's access level to the "lowest" effective level across all
+          // partitions. That is, if one partition has READ_ONLY and another has
+          // WRITE_ONLY the table's access level should be NONE.
+          accessLevel_ = TAccessLevel.READ_ONLY;
+      }
       return partition;
     } catch (IOException e) {
       throw new CatalogException("Error initializing partition", e);
@@ -1660,18 +1660,6 @@ public class HdfsTable extends Table implements FeFsTable {
       // If the partition is null, its HDFS path does not exist, and it was not added to
       // this table's partition list. Skip the partition.
       if (partition == null) continue;
-      if (msPartition.getParameters() != null) {
-        partition.setNumRows(FeCatalogUtils.getRowCount(msPartition.getParameters()));
-      }
-      if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) {
-        // TODO: READ_ONLY isn't exactly correct because the it's possible the
-        // partition does not have READ permissions either. When we start checking
-        // whether we can READ from a table, this should be updated to set the
-        // table's access level to the "lowest" effective level across all
-        // partitions. That is, if one partition has READ_ONLY and another has
-        // WRITE_ONLY the table's access level should be NONE.
-        accessLevel_ = TAccessLevel.READ_ONLY;
-      }
       refreshPartitionFileMetadata(partition);
     }
   }

http://git-wip-us.apache.org/repos/asf/impala/blob/3bbfbb5e/tests/metadata/test_refresh_partition.py
----------------------------------------------------------------------
diff --git a/tests/metadata/test_refresh_partition.py b/tests/metadata/test_refresh_partition.py
index 187055c..2c31b93 100644
--- a/tests/metadata/test_refresh_partition.py
+++ b/tests/metadata/test_refresh_partition.py
@@ -44,6 +44,35 @@ class TestRefreshPartition(ImpalaTestSuite):
     cls.ImpalaTestMatrix.add_dimension(
         create_uncompressed_text_dimension(cls.get_workload()))
 
+  def test_refresh_partition_num_rows(self, vector, unique_database):
+    """Refreshing a partition should not change it's numRows stat."""
+    # Create a partitioned table and add data to it.
+    tbl = unique_database + ".t1"
+    self.client.execute("create table %s(a int) partitioned by (b int)" % tbl)
+    self.client.execute("insert into %s partition(b=1) values (1)" % tbl)
+    # Compute stats on tbl. It should populate the partition num rows.
+    self.client.execute("compute stats %s" % tbl)
+    result = self.client.execute("show partitions %s" % tbl)
+    # Format: partition/#Rows/#Files (first 3 entries)
+    assert result.get_data().startswith("1\t1\t1"),\
+        "Incorrect partition stats %s" % result.get_data()
+    # Add another file to the same partition using hive.
+    self.run_stmt_in_hive("insert into table %s partition (b=1) values (2)" % tbl)
+    # Make sure Impala still sees a single row.
+    assert "1" == self.client.execute("select count(*) from %s" % tbl).get_data()
+    # refresh the partition and make sure the new row is visible
+    self.client.execute("refresh %s partition (b=1)" % tbl)
+    assert "2" == self.client.execute("select count(*) from %s" % tbl).get_data()
+    # Make sure the partition num rows are unchanged and still 1 but the #files is updated.
+    result = self.client.execute("show partitions %s" % tbl)
+    assert result.get_data().startswith("1\t1\t2"),\
+        "Incorrect partition stats %s" % result.get_data()
+    # Do a full table refresh and it should still remain the same.
+    self.client.execute("refresh %s" % tbl)
+    result = self.client.execute("show partitions %s" % tbl)
+    assert result.get_data().startswith("1\t1\t2"),\
+        "Incorrect partition stats %s" % result.get_data()
+
   def test_add_hive_partition_and_refresh(self, vector, unique_database):
     """
     Partition added in Hive can be viewed in Impala after refreshing