You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/10/20 04:39:23 UTC

[GitHub] [hudi] nsivabalan commented on a diff in pull request #6498: [HUDI-4878] Fix incremental cleaner use case

nsivabalan commented on code in PR #6498:
URL: https://github.com/apache/hudi/pull/6498#discussion_r980376754


##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java:
##########
@@ -148,23 +148,52 @@ public List<String> getPartitionPathsToClean(Option<HoodieInstant> earliestRetai
    * @return list of partitions
    * @throws IOException
    */
-  private List<String> getPartitionPathsForCleanByCommits(Option<HoodieInstant> instantToRetain) throws IOException {
-    if (!instantToRetain.isPresent()) {
-      LOG.info("No earliest commit to retain. No need to scan partitions !!");
-      return Collections.emptyList();
-    }
+  private List<String> getPartitionPathsForCleanByCommits(Option<HoodieInstant> instantToRetain, HoodieCleaningPolicy cleaningPolicy) throws IOException {
+    if (cleaningPolicy != HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) {
+      if (!instantToRetain.isPresent()) {
+        LOG.info("No earliest commit to retain. No need to scan partitions !!");
+        return Collections.emptyList();
+      }
 
-    if (config.incrementalCleanerModeEnabled()) {
-      Option<HoodieInstant> lastClean = hoodieTable.getCleanTimeline().filterCompletedInstants().lastInstant();
-      if (lastClean.isPresent()) {
-        if (hoodieTable.getActiveTimeline().isEmpty(lastClean.get())) {
-          hoodieTable.getActiveTimeline().deleteEmptyInstantIfExists(lastClean.get());
-        } else {
-          HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils
+      if (config.incrementalCleanerModeEnabled()) {
+        Option<HoodieInstant> lastClean = hoodieTable.getCleanTimeline().filterCompletedInstants().lastInstant();
+        if (lastClean.isPresent()) {
+          if (hoodieTable.getActiveTimeline().isEmpty(lastClean.get())) {
+            hoodieTable.getActiveTimeline().deleteEmptyInstantIfExists(lastClean.get());
+          } else {
+            HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils
+                .deserializeHoodieCleanMetadata(hoodieTable.getActiveTimeline().getInstantDetails(lastClean.get()).get());
+            if ((cleanMetadata.getEarliestCommitToRetain() != null)
+                && (cleanMetadata.getEarliestCommitToRetain().length() > 0)) {
+              return getPartitionPathsForIncrementalCleaning(cleanMetadata.getEarliestCommitToRetain(), instantToRetain);
+            }
+          }
+        }
+      }
+    } else {
+      // FILE_VERSIONS_RETAINED
+      if (config.incrementalCleanerModeEnabled()) {
+        // find the last completed commit from last clean and mark that as earliest commit to retain.
+        Option<HoodieInstant> lastClean = hoodieTable.getCleanTimeline().filterCompletedInstants().lastInstant();
+        if (lastClean.isPresent()) {
+          if (hoodieTable.getActiveTimeline().isEmpty(lastClean.get())) {
+            hoodieTable.getActiveTimeline().deleteEmptyInstantIfExists(lastClean.get());
+          } else {
+            HoodieCleanMetadata cleanMetadata = null;
+            try {
+              cleanMetadata = TimelineMetadataUtils
                   .deserializeHoodieCleanMetadata(hoodieTable.getActiveTimeline().getInstantDetails(lastClean.get()).get());
-          if ((cleanMetadata.getEarliestCommitToRetain() != null)
-                  && (cleanMetadata.getEarliestCommitToRetain().length() > 0)) {
-            return getPartitionPathsForIncrementalCleaning(cleanMetadata, instantToRetain);
+              if (!StringUtils.isNullOrEmpty(cleanMetadata.getLastCompletedCommitTimestamp())) {

Review Comment:
   thats a valid point. lets take this as a separate bug fix. 



##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java:
##########
@@ -148,23 +148,52 @@ public List<String> getPartitionPathsToClean(Option<HoodieInstant> earliestRetai
    * @return list of partitions
    * @throws IOException
    */
-  private List<String> getPartitionPathsForCleanByCommits(Option<HoodieInstant> instantToRetain) throws IOException {
-    if (!instantToRetain.isPresent()) {
-      LOG.info("No earliest commit to retain. No need to scan partitions !!");
-      return Collections.emptyList();
-    }
+  private List<String> getPartitionPathsForCleanByCommits(Option<HoodieInstant> instantToRetain, HoodieCleaningPolicy cleaningPolicy) throws IOException {
+    if (cleaningPolicy != HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) {
+      if (!instantToRetain.isPresent()) {
+        LOG.info("No earliest commit to retain. No need to scan partitions !!");
+        return Collections.emptyList();
+      }
 
-    if (config.incrementalCleanerModeEnabled()) {
-      Option<HoodieInstant> lastClean = hoodieTable.getCleanTimeline().filterCompletedInstants().lastInstant();
-      if (lastClean.isPresent()) {
-        if (hoodieTable.getActiveTimeline().isEmpty(lastClean.get())) {
-          hoodieTable.getActiveTimeline().deleteEmptyInstantIfExists(lastClean.get());
-        } else {
-          HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils
+      if (config.incrementalCleanerModeEnabled()) {
+        Option<HoodieInstant> lastClean = hoodieTable.getCleanTimeline().filterCompletedInstants().lastInstant();
+        if (lastClean.isPresent()) {
+          if (hoodieTable.getActiveTimeline().isEmpty(lastClean.get())) {
+            hoodieTable.getActiveTimeline().deleteEmptyInstantIfExists(lastClean.get());
+          } else {
+            HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils
+                .deserializeHoodieCleanMetadata(hoodieTable.getActiveTimeline().getInstantDetails(lastClean.get()).get());
+            if ((cleanMetadata.getEarliestCommitToRetain() != null)
+                && (cleanMetadata.getEarliestCommitToRetain().length() > 0)) {
+              return getPartitionPathsForIncrementalCleaning(cleanMetadata.getEarliestCommitToRetain(), instantToRetain);
+            }
+          }
+        }
+      }
+    } else {
+      // FILE_VERSIONS_RETAINED
+      if (config.incrementalCleanerModeEnabled()) {
+        // find the last completed commit from last clean and mark that as earliest commit to retain.
+        Option<HoodieInstant> lastClean = hoodieTable.getCleanTimeline().filterCompletedInstants().lastInstant();
+        if (lastClean.isPresent()) {
+          if (hoodieTable.getActiveTimeline().isEmpty(lastClean.get())) {
+            hoodieTable.getActiveTimeline().deleteEmptyInstantIfExists(lastClean.get());
+          } else {
+            HoodieCleanMetadata cleanMetadata = null;
+            try {
+              cleanMetadata = TimelineMetadataUtils
                   .deserializeHoodieCleanMetadata(hoodieTable.getActiveTimeline().getInstantDetails(lastClean.get()).get());
-          if ((cleanMetadata.getEarliestCommitToRetain() != null)
-                  && (cleanMetadata.getEarliestCommitToRetain().length() > 0)) {
-            return getPartitionPathsForIncrementalCleaning(cleanMetadata, instantToRetain);
+              if (!StringUtils.isNullOrEmpty(cleanMetadata.getLastCompletedCommitTimestamp())) {

Review Comment:
   https://issues.apache.org/jira/browse/HUDI-4921



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org