You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jr...@apache.org on 2017/03/14 03:33:45 UTC
[1/2] incubator-impala git commit: IMPALA-4762: RECOVER PARTITIONS
should batch partition updates
Repository: incubator-impala
Updated Branches:
refs/heads/master d9602df71 -> e1b092de6
IMPALA-4762: RECOVER PARTITIONS should batch partition updates
Batch updates when doing a RECOVER PARTITIONS on over 500
partitions at a time to avoid HMS timeouts, possible OOM.
Testing: Expanded test coverage with a new python test
for this case. Test takes ~18s to run.
Change-Id: I7f9334051b11ba8fa16159b7ca67ddc7f2392733
Reviewed-on: http://gerrit.cloudera.org:8080/6275
Reviewed-by: Dimitris Tsirogiannis <dt...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/eec8d6fd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/eec8d6fd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/eec8d6fd
Branch: refs/heads/master
Commit: eec8d6fd15dc9e914a773aab1390f95d97f515ae
Parents: d9602df
Author: Zach Amsden <za...@cloudera.com>
Authored: Mon Mar 6 23:43:45 2017 +0000
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Mar 14 01:38:59 2017 +0000
----------------------------------------------------------------------
.../impala/service/CatalogOpExecutor.java | 52 ++++++++++----------
tests/metadata/test_recover_partitions.py | 31 ++++++++++++
2 files changed, 58 insertions(+), 25 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/eec8d6fd/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
index 9d82c07..5011769 100644
--- a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
+++ b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
@@ -2633,31 +2633,33 @@ public class CatalogOpExecutor {
// Add partitions to metastore.
try (MetaStoreClient msClient = catalog_.getMetaStoreClient()) {
- // ifNotExists and needResults are true.
- hmsPartitions = msClient.getHiveClient().add_partitions(hmsPartitions,
- true, true);
- for (Partition partition: hmsPartitions) {
- // Create and add the HdfsPartition. Return the table object with an updated
- // catalog version.
- addHdfsPartition(tbl, partition);
- }
+ // Apply the updates in batches of 'MAX_PARTITION_UPDATES_PER_RPC'.
+ for (int i = 0; i < hmsPartitions.size(); i += MAX_PARTITION_UPDATES_PER_RPC) {
+ int endPartitionIndex =
+ Math.min(i + MAX_PARTITION_UPDATES_PER_RPC, hmsPartitions.size());
+ List<Partition> hmsSublist = hmsPartitions.subList(i, endPartitionIndex);
+ // ifNotExists and needResults are true.
+ List<Partition> hmsAddedPartitions =
+ msClient.getHiveClient().add_partitions(hmsSublist, true, true);
+ for (Partition partition: hmsAddedPartitions) {
+ // Create and add the HdfsPartition. Return the table object with an updated
+ // catalog version.
+ addHdfsPartition(tbl, partition);
+ }
- // Handle HDFS cache.
- if (cachePoolName != null) {
- for (Partition partition: hmsPartitions) {
- long id = HdfsCachingUtil.submitCachePartitionDirective(partition,
- cachePoolName, replication);
- cacheIds.add(id);
+ // Handle HDFS cache.
+ if (cachePoolName != null) {
+ for (Partition partition: hmsAddedPartitions) {
+ long id = HdfsCachingUtil.submitCachePartitionDirective(partition,
+ cachePoolName, replication);
+ cacheIds.add(id);
+ }
+ // Update the partition metadata to include the cache directive id.
+ msClient.getHiveClient().alter_partitions(tableName.getDb(),
+ tableName.getTbl(), hmsAddedPartitions);
}
- // Update the partition metadata to include the cache directive id.
- msClient.getHiveClient().alter_partitions(tableName.getDb(),
- tableName.getTbl(), hmsPartitions);
+ updateLastDdlTime(msTbl, msClient);
}
- updateLastDdlTime(msTbl, msClient);
- } catch (AlreadyExistsException e) {
- // This may happen when another client of HMS has added the partitions.
- LOG.trace(String.format("Ignoring '%s' when adding partition to %s.", e,
- tableName));
} catch (TException e) {
throw new ImpalaRuntimeException(
String.format(HMS_RPC_ERROR_FORMAT_STR, "add_partition"), e);
@@ -2956,15 +2958,15 @@ public class CatalogOpExecutor {
try (MetaStoreClient msClient = catalog_.getMetaStoreClient()) {
// Apply the updates in batches of 'MAX_PARTITION_UPDATES_PER_RPC'.
for (int i = 0; i < hmsPartitions.size(); i += MAX_PARTITION_UPDATES_PER_RPC) {
- int numPartitionsToUpdate =
+ int endPartitionIndex =
Math.min(i + MAX_PARTITION_UPDATES_PER_RPC, hmsPartitions.size());
try {
// Alter partitions in bulk.
msClient.getHiveClient().alter_partitions(dbName, tableName,
- hmsPartitions.subList(i, numPartitionsToUpdate));
+ hmsPartitions.subList(i, endPartitionIndex));
// Mark the corresponding HdfsPartition objects as dirty
for (org.apache.hadoop.hive.metastore.api.Partition msPartition:
- hmsPartitions.subList(i, numPartitionsToUpdate)) {
+ hmsPartitions.subList(i, endPartitionIndex)) {
try {
catalog_.getHdfsPartition(dbName, tableName, msPartition).markDirty();
} catch (PartitionNotFoundException e) {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/eec8d6fd/tests/metadata/test_recover_partitions.py
----------------------------------------------------------------------
diff --git a/tests/metadata/test_recover_partitions.py b/tests/metadata/test_recover_partitions.py
index cb0cf98..58a7219 100644
--- a/tests/metadata/test_recover_partitions.py
+++ b/tests/metadata/test_recover_partitions.py
@@ -168,6 +168,37 @@ class TestRecoverPartitions(ImpalaTestSuite):
assert self.has_value(INSERTED_VALUE, result.data)
@SkipIfLocal.hdfs_client
+ def test_recover_many_partitions(self, vector, unique_database):
+ """Test that RECOVER PARTITIONS correctly discovers new partitions added externally
+ by the hdfs client, recovered in batches"""
+
+ TBL_NAME = "test_recover_partitions"
+ FQ_TBL_NAME = unique_database + "." + TBL_NAME
+ TBL_LOCATION = self.__get_fs_location(unique_database, TBL_NAME)
+
+ self.execute_query_expect_success(self.client,
+ "CREATE TABLE %s (c int) PARTITIONED BY (s string)" % (FQ_TBL_NAME))
+
+ # Create 700 partitions externally
+ for i in xrange(1, 700):
+ PART_DIR = "s=part%d/" % i
+ FILE_PATH = "test"
+ self.filesystem_client.make_dir(TBL_LOCATION + PART_DIR)
+
+ result = self.execute_query_expect_success(self.client,
+ "SHOW PARTITIONS %s" % FQ_TBL_NAME)
+ for i in xrange(1, 700):
+ PART_DIR = "part%d\t" % i
+ assert not self.has_value(PART_DIR, result.data)
+ self.execute_query_expect_success(self.client,
+ "ALTER TABLE %s RECOVER PARTITIONS" % FQ_TBL_NAME)
+ result = self.execute_query_expect_success(self.client,
+ "SHOW PARTITIONS %s" % FQ_TBL_NAME)
+ for i in xrange(1, 700):
+ PART_DIR = "part%d\t" % i
+ assert self.has_value(PART_DIR, result.data)
+
+ @SkipIfLocal.hdfs_client
def test_duplicate_partitions(self, vector, unique_database):
"""Test that RECOVER PARTITIONS does not recover equivalent partitions. Two partitions
are considered equivalent if they correspond to distinct paths but can be converted
[2/2] incubator-impala git commit: [DOCS] Fix build error post merge
conflict
Posted by jr...@apache.org.
[DOCS] Fix build error post merge conflict
The verification job succeeded in merging gerrit #6345
despite the presence of a build error. Fixing the build
error in a separate gerrit because can't continue #6345
once it's merged.
Change-Id: I9bf8d11e06a68103f6760db72cedf372363f75f0
Reviewed-on: http://gerrit.cloudera.org:8080/6373
Reviewed-by: Ambreen Kazi <am...@cloudera.com>
Reviewed-by: John Russell <jr...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/e1b092de
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/e1b092de
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/e1b092de
Branch: refs/heads/master
Commit: e1b092de683b4d2e67c90deffd2da7777631f3e1
Parents: eec8d6f
Author: John Russell <jr...@cloudera.com>
Authored: Mon Mar 13 17:06:02 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Mar 14 03:26:24 2017 +0000
----------------------------------------------------------------------
docs/impala_keydefs.ditamap | 2 ++
1 file changed, 2 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/e1b092de/docs/impala_keydefs.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala_keydefs.ditamap b/docs/impala_keydefs.ditamap
index 7a85fd7..0c41dcc 100644
--- a/docs/impala_keydefs.ditamap
+++ b/docs/impala_keydefs.ditamap
@@ -163,6 +163,8 @@ under the License.
<topicmeta><linktext>the documentation for your Apache Hadoop distribution</linktext></topicmeta>
</keydef>
<keydef href="" scope="external" format="html" keys="setup_hdfs_caching">
+ <topicmeta><linktext>the documentation for your Apache Hadoop distribution</linktext></topicmeta>
+ </keydef>
<keydef href="" scope="external" format="html" keys="cdh_hag_cdh_other_ha">
<topicmeta><linktext>the documentation for your Apache Hadoop distribution</linktext></topicmeta>
</keydef>