Posted to commits@hudi.apache.org by yi...@apache.org on 2023/03/17 22:17:34 UTC

[hudi] branch master updated: [HUDI-5920] Improve documentation of parallelism configs (#8157)

This is an automated email from the ASF dual-hosted git repository.

yihua pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new e51b4575cb7 [HUDI-5920] Improve documentation of parallelism configs (#8157)
e51b4575cb7 is described below

commit e51b4575cb7642eb61bcc02d95c99466dd3e8eda
Author: Y Ethan Guo <et...@gmail.com>
AuthorDate: Fri Mar 17 15:17:24 2023 -0700

    [HUDI-5920] Improve documentation of parallelism configs (#8157)
    
    This commit improves the documentation for the following parallelism configs:
    - hoodie.archive.delete.parallelism
    - hoodie.bloom.index.parallelism
    - hoodie.simple.index.parallelism
    - hoodie.global.simple.index.parallelism
    - hoodie.insert.shuffle.parallelism
    - hoodie.bulkinsert.shuffle.parallelism
    - hoodie.upsert.shuffle.parallelism
    - hoodie.delete.shuffle.parallelism
    - hoodie.rollback.parallelism
---
 .../apache/hudi/config/HoodieArchivalConfig.java   |  9 +++-
 .../org/apache/hudi/config/HoodieIndexConfig.java  | 17 ++++++--
 .../org/apache/hudi/config/HoodieWriteConfig.java  | 48 ++++++++++++++++++----
 3 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieArchivalConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieArchivalConfig.java
index 681ca20baee..fdfdab5897a 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieArchivalConfig.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieArchivalConfig.java
@@ -63,7 +63,14 @@ public class HoodieArchivalConfig extends HoodieConfig {
   public static final ConfigProperty<Integer> DELETE_ARCHIVED_INSTANT_PARALLELISM_VALUE = ConfigProperty
       .key("hoodie.archive.delete.parallelism")
       .defaultValue(100)
-      .withDocumentation("Parallelism for deleting archived hoodie commits.");
+      .withDocumentation("When performing archival operation, Hudi needs to delete the files of "
+          + "the archived instants in the active timeline in .hoodie folder. The file deletion "
+          + "also happens after merging small archived files into larger ones if enabled. "
+          + "This config limits the Spark parallelism for deleting files in both cases, i.e., "
+          + "parallelism of deleting files does not go above the configured value and the "
+          + "parallelism is the number of files to delete if smaller than the "
+          + "configured value.  If you see that the file deletion in archival operation is slow "
+          + "because of the limited parallelism, you can increase this to tune the performance.");
 
   public static final ConfigProperty<String> MIN_COMMITS_TO_KEEP = ConfigProperty
       .key("hoodie.keep.min.commits")
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java
index 366e6aa4c04..0767fc46a56 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java
@@ -115,7 +115,10 @@ public class HoodieIndexConfig extends HoodieConfig {
       .defaultValue("0")
       .withDocumentation("Only applies if index type is BLOOM. "
           + "This is the amount of parallelism for index lookup, which involves a shuffle. "
-          + "By default, this is auto computed based on input workload characteristics.");
+          + "By default, this is auto computed based on input workload characteristics. "
+          + "If the parallelism is explicitly configured by the user, the user-configured "
+          + "value is used in defining the actual parallelism. If the indexing stage is slow "
+          + "due to the limited parallelism, you can increase this to tune the performance.");
 
   public static final ConfigProperty<String> BLOOM_INDEX_PRUNE_BY_RANGES = ConfigProperty
       .key("hoodie.bloom.index.prune.by.ranges")
@@ -181,13 +184,21 @@ public class HoodieIndexConfig extends HoodieConfig {
       .key("hoodie.simple.index.parallelism")
       .defaultValue("100")
       .withDocumentation("Only applies if index type is SIMPLE. "
-          + "This is the amount of parallelism for index lookup, which involves a Spark Shuffle");
+          + "This limits the parallelism of fetching records from the base files of affected "
+          + "partitions. The index picks the configured parallelism if the number of base "
+          + "files is larger than this configured value; otherwise, the number of base files "
+          + "is used as the parallelism. If the indexing stage is slow due to the limited "
+          + "parallelism, you can increase this to tune the performance.");
 
   public static final ConfigProperty<String> GLOBAL_SIMPLE_INDEX_PARALLELISM = ConfigProperty
       .key("hoodie.global.simple.index.parallelism")
       .defaultValue("100")
       .withDocumentation("Only applies if index type is GLOBAL_SIMPLE. "
-          + "This is the amount of parallelism for index lookup, which involves a Spark Shuffle");
+          + "This limits the parallelism of fetching records from the base files of all table "
+          + "partitions. The index picks the configured parallelism if the number of base "
+          + "files is larger than this configured value; otherwise, the number of base files "
+          + "is used as the parallelism. If the indexing stage is slow due to the limited "
+          + "parallelism, you can increase this to tune the performance.");
 
   // 1B bloom filter checks happen in 250 seconds. 500ms to read a bloom filter.
   // 10M checks in 2500ms, thus amortizing the cost of reading bloom filter across partitions.
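
These index parallelism configs pair with the existing hoodie.index.type key
defined in this same class. A hedged sketch of bounding SIMPLE index lookup
parallelism, assuming the same `spark` and `df` as the sketch above; the value
200 and the table name are illustrative assumptions:

    df.write().format("hudi")
        .option("hoodie.table.name", "example_table")
        .option("hoodie.datasource.write.operation", "upsert")
        .option("hoodie.index.type", "SIMPLE")
        // Effective lookup parallelism is the smaller of the number of base
        // files in affected partitions and this configured value:
        .option("hoodie.simple.index.parallelism", "200")
        .mode(SaveMode.Append)
        .save("/tmp/hudi/example_table");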
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
index b919befbbc9..04d70a85f1a 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
@@ -248,13 +248,29 @@ public class HoodieWriteConfig extends HoodieConfig {
   public static final ConfigProperty<String> INSERT_PARALLELISM_VALUE = ConfigProperty
       .key("hoodie.insert.shuffle.parallelism")
       .defaultValue("0")
-      .withDocumentation("Parallelism for inserting records into the table. Inserts can shuffle data before writing to tune file sizes and optimize the storage layout.");
+      .withDocumentation("Parallelism for inserting records into the table. Inserts can shuffle "
+          + "data before writing to tune file sizes and optimize the storage layout. Before "
+          + "0.13.0 release, if users do not configure it, Hudi would use 200 as the default "
+          + "shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the "
+          + "parallelism deduced by Spark based on the source data. If the shuffle parallelism "
+          + "is explicitly configured by the user, the user-configured parallelism is "
+          + "used in defining the actual parallelism. If you observe small files from the insert "
+          + "operation, we suggest configuring this shuffle parallelism explicitly, so that the "
+          + "parallelism is around total_input_data_size/120MB.");
 
   public static final ConfigProperty<String> BULKINSERT_PARALLELISM_VALUE = ConfigProperty
       .key("hoodie.bulkinsert.shuffle.parallelism")
       .defaultValue("0")
-      .withDocumentation("For large initial imports using bulk_insert operation, controls the parallelism to use for sort modes or custom partitioning done"
-          + "before writing records to the table.");
+      .withDocumentation("For large initial imports using bulk_insert operation, controls the "
+          + "parallelism to use for sort modes or custom partitioning done before writing records "
+          + "to the table. Before 0.13.0 release, if users do not configure it, Hudi would use "
+          + "200 as the default shuffle parallelism. From 0.13.0 onwards Hudi by default "
+          + "automatically uses the parallelism deduced by Spark based on the source data or "
+          + "the parallelism based on the logical plan for row writer. If the shuffle parallelism "
+          + "is explicitly configured by the user, the user-configured parallelism is "
+          + "used in defining the actual parallelism. If you observe small files from the bulk insert "
+          + "operation, we suggest configuring this shuffle parallelism explicitly, so that the "
+          + "parallelism is around total_input_data_size/120MB.");
 
   public static final ConfigProperty<String> BULKINSERT_USER_DEFINED_PARTITIONER_SORT_COLUMNS = ConfigProperty
       .key("hoodie.bulkinsert.user.defined.partitioner.sort.columns")
@@ -272,18 +288,36 @@ public class HoodieWriteConfig extends HoodieConfig {
   public static final ConfigProperty<String> UPSERT_PARALLELISM_VALUE = ConfigProperty
       .key("hoodie.upsert.shuffle.parallelism")
       .defaultValue("0")
-      .withDocumentation("Parallelism to use for upsert operation on the table. Upserts can shuffle data to perform index lookups, file sizing, bin packing records optimally"
-          + "into file groups.");
+      .withDocumentation("Parallelism to use for upsert operation on the table. Upserts can "
+          + "shuffle data to perform index lookups, file sizing, bin packing records optimally "
+          + "into file groups. Before 0.13.0 release, "
+          + "if users do not configure it, Hudi would use 200 as the default "
+          + "shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the "
+          + "parallelism deduced by Spark based on the source data. If the shuffle parallelism "
+          + "is explicitly configured by the user, the user-configured parallelism is "
+          + "used in defining the actual parallelism. If you observe small files from the upsert "
+          + "operation, we suggest configuring this shuffle parallelism explicitly, so that the "
+          + "parallelism is around total_input_data_size/120MB.");
 
   public static final ConfigProperty<String> DELETE_PARALLELISM_VALUE = ConfigProperty
       .key("hoodie.delete.shuffle.parallelism")
       .defaultValue("0")
-      .withDocumentation("Parallelism used for “delete” operation. Delete operations also performs shuffles, similar to upsert operation.");
+      .withDocumentation("Parallelism used for delete operation. Delete operations also performs "
+          + "shuffles, similar to upsert operation. Before 0.13.0 release, "
+          + "if users do not configure it, Hudi would use 200 as the default "
+          + "shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the "
+          + "parallelism deduced by Spark based on the source data. If the shuffle parallelism "
+          + "is explicitly configured by the user, the user-configured parallelism is "
+          + "used in defining the actual parallelism.");
 
   public static final ConfigProperty<String> ROLLBACK_PARALLELISM_VALUE = ConfigProperty
       .key("hoodie.rollback.parallelism")
       .defaultValue("100")
-      .withDocumentation("Parallelism for rollback of commits. Rollbacks perform delete of files or logging delete blocks to file groups on storage in parallel.");
+      .withDocumentation("This config controls the parallelism for rollback of commits. "
+          + "Rollbacks perform deletion of files or logging delete blocks to file groups on "
+          + "storage in parallel. The configure value limits the parallelism so that the number "
+          + "of Spark tasks do not exceed the value. If rollback is slow due to the limited "
+          + "parallelism, you can increase this to tune the performance.");
 
   public static final ConfigProperty<String> WRITE_BUFFER_LIMIT_BYTES_VALUE = ConfigProperty
       .key("hoodie.write.buffer.limit.bytes")