You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/09/23 13:48:32 UTC

[GitHub] [hudi] parisni commented on a diff in pull request #6498: [HUDI-4878] Fix incremental cleaner use case

parisni commented on code in PR #6498:
URL: https://github.com/apache/hudi/pull/6498#discussion_r978689540


##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java:
##########
@@ -148,23 +148,52 @@ public List<String> getPartitionPathsToClean(Option<HoodieInstant> earliestRetai
    * @return list of partitions
    * @throws IOException
    */
-  private List<String> getPartitionPathsForCleanByCommits(Option<HoodieInstant> instantToRetain) throws IOException {
-    if (!instantToRetain.isPresent()) {
-      LOG.info("No earliest commit to retain. No need to scan partitions !!");
-      return Collections.emptyList();
-    }
+  private List<String> getPartitionPathsForCleanByCommits(Option<HoodieInstant> instantToRetain, HoodieCleaningPolicy cleaningPolicy) throws IOException {
+    if (cleaningPolicy != HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) {
+      if (!instantToRetain.isPresent()) {
+        LOG.info("No earliest commit to retain. No need to scan partitions !!");
+        return Collections.emptyList();
+      }
 
-    if (config.incrementalCleanerModeEnabled()) {
-      Option<HoodieInstant> lastClean = hoodieTable.getCleanTimeline().filterCompletedInstants().lastInstant();
-      if (lastClean.isPresent()) {
-        if (hoodieTable.getActiveTimeline().isEmpty(lastClean.get())) {
-          hoodieTable.getActiveTimeline().deleteEmptyInstantIfExists(lastClean.get());
-        } else {
-          HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils
+      if (config.incrementalCleanerModeEnabled()) {
+        Option<HoodieInstant> lastClean = hoodieTable.getCleanTimeline().filterCompletedInstants().lastInstant();
+        if (lastClean.isPresent()) {
+          if (hoodieTable.getActiveTimeline().isEmpty(lastClean.get())) {
+            hoodieTable.getActiveTimeline().deleteEmptyInstantIfExists(lastClean.get());
+          } else {
+            HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils
+                .deserializeHoodieCleanMetadata(hoodieTable.getActiveTimeline().getInstantDetails(lastClean.get()).get());
+            if ((cleanMetadata.getEarliestCommitToRetain() != null)
+                && (cleanMetadata.getEarliestCommitToRetain().length() > 0)) {
+              return getPartitionPathsForIncrementalCleaning(cleanMetadata.getEarliestCommitToRetain(), instantToRetain);
+            }
+          }
+        }
+      }
+    } else {
+      // FILE_VERSIONS_RETAINED
+      if (config.incrementalCleanerModeEnabled()) {
+        // find the last completed commit from last clean and mark that as earliest commit to retain.
+        Option<HoodieInstant> lastClean = hoodieTable.getCleanTimeline().filterCompletedInstants().lastInstant();
+        if (lastClean.isPresent()) {
+          if (hoodieTable.getActiveTimeline().isEmpty(lastClean.get())) {
+            hoodieTable.getActiveTimeline().deleteEmptyInstantIfExists(lastClean.get());
+          } else {
+            HoodieCleanMetadata cleanMetadata = null;

Review Comment:
   no need to initilize to null



##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java:
##########
@@ -174,34 +203,39 @@ private List<String> getPartitionPathsForCleanByCommits(Option<HoodieInstant> in
 
   /**
    * Use Incremental Mode for finding partition paths.
-   * @param cleanMetadata
+   * @param previousInstantRetained previous instant retained.
    * @param newInstantToRetain
    * @return
    */
-  private List<String> getPartitionPathsForIncrementalCleaning(HoodieCleanMetadata cleanMetadata,
+  private List<String> getPartitionPathsForIncrementalCleaning(String previousInstantRetained,
       Option<HoodieInstant> newInstantToRetain) {
     LOG.info("Incremental Cleaning mode is enabled. Looking up partition-paths that have since changed "

Review Comment:
   Also log info when brute force in `getPartitionPathsForFullCleaning` to better track what happens in the future ?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org