You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by yi...@apache.org on 2023/03/14 18:56:52 UTC

[hudi] branch master updated: [HUDI-5925] Improve bootstrap parallelism (#8170)

This is an automated email from the ASF dual-hosted git repository.

yihua pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new 17fdd10fdc3 [HUDI-5925] Improve bootstrap parallelism (#8170)
17fdd10fdc3 is described below

commit 17fdd10fdc3ef54aaf6286623a52d97d1562c9df
Author: Y Ethan Guo <et...@gmail.com>
AuthorDate: Tue Mar 14 11:56:45 2023 -0700

    [HUDI-5925] Improve bootstrap parallelism (#8170)
---
 .../java/org/apache/hudi/config/HoodieBootstrapConfig.java     | 10 +++++++++-
 .../action/bootstrap/SparkBootstrapCommitActionExecutor.java   |  5 +++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java
index c7bb3d3901f..a38e72906f7 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java
@@ -96,7 +96,15 @@ public class HoodieBootstrapConfig extends HoodieConfig {
       .key("hoodie.bootstrap.parallelism")
       .defaultValue("1500")
       .sinceVersion("0.6.0")
-      .withDocumentation("Parallelism value to be used to bootstrap data into hudi");
+      .withDocumentation("For metadata-only bootstrap, Hudi parallelizes the operation so that "
+          + "each table partition is handled by one Spark task. This config limits the number "
+          + "of parallelism. We pick the configured parallelism if the number of table partitions "
+          + "is larger than this configured value. The parallelism is assigned to the number of "
+          + "table partitions if it is smaller than the configured value. For full-record "
+          + "bootstrap, i.e., BULK_INSERT operation of the records, this configured value is "
+          + "passed as the BULK_INSERT shuffle parallelism (`hoodie.bulkinsert.shuffle.parallelism`), "
+          + "determining the BULK_INSERT write behavior. If you see that the bootstrap is slow "
+          + "due to the limited parallelism, you can increase this.");
 
   public static final ConfigProperty<String> PARTITION_SELECTOR_REGEX_PATTERN = ConfigProperty
       .key("hoodie.bootstrap.mode.selector.regex")
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java
index e3524f4d709..4ddd481b328 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java
@@ -368,9 +368,10 @@ public class SparkBootstrapCommitActionExecutor<T>
         .collect(Collectors.toList());
 
     context.setJobStatus(this.getClass().getSimpleName(), "Run metadata-only bootstrap operation: " + config.getTableName());
-    return context.parallelize(bootstrapPaths, config.getBootstrapParallelism())
+    return context.parallelize(
+            bootstrapPaths, Math.min(bootstrapPaths.size(), config.getBootstrapParallelism()))
         .map(partitionFsPair -> getMetadataHandler(config, table, partitionFsPair.getRight().getRight()).runMetadataBootstrap(partitionFsPair.getLeft(),
-                partitionFsPair.getRight().getLeft(), keyGenerator));
+            partitionFsPair.getRight().getLeft(), keyGenerator));
   }
 
   @Override