Posted to commits@gobblin.apache.org by zi...@apache.org on 2023/05/24 20:47:03 UTC

[gobblin] branch master updated: [GOBBLIN-1825] Hive retention job should fail if deleting underlying files fails (#3687)

This is an automated email from the ASF dual-hosted git repository.

zihanli58 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/gobblin.git


The following commit(s) were added to refs/heads/master by this push:
     new 158d6fff5 [GOBBLIN-1825] Hive retention job should fail if deleting underlying files fails (#3687)
158d6fff5 is described below

commit 158d6fff546ee28ad7b542bfe78e5263af8843c6
Author: meethngala <me...@gmail.com>
AuthorDate: Wed May 24 13:46:56 2023 -0700

    [GOBBLIN-1825] Hive retention job should fail if deleting underlying files fails (#3687)
    
    * fail hive retention if we fail to delete underlying hdfs files
    
    * address PR comments
    
    * address PR comments
    
    ---------
    
    Co-authored-by: Meeth Gala <mg...@linkedin.com>
---
 .../retention/version/HiveDatasetVersionCleaner.java  | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/retention/version/HiveDatasetVersionCleaner.java b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/retention/version/HiveDatasetVersionCleaner.java
index ffcc3b352..1363be2e0 100644
--- a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/retention/version/HiveDatasetVersionCleaner.java
+++ b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/retention/version/HiveDatasetVersionCleaner.java
@@ -88,20 +88,31 @@ public class HiveDatasetVersionCleaner extends VersionCleaner {
       Partition partition = hiveDatasetVersion.getPartition();
       try {
         if (!cleanableHiveDataset.isSimulate()) {
+          // As part of the cleanup process we want to delete both the hive partition and the underlying hdfs files.
+          // However, if the partition is dropped but the hdfs files are not, we are left with dangling files.
+          // We therefore reverse the order: clean up the hdfs files first, then drop the hive partition.
+          // If HMS is unresponsive and the partition cannot be dropped, re-running hive retention will drop the
+          // partition and simply find no hdfs files left to delete; alternatively, the flag `isShouldDeleteData`
+          // can be set to false.
+          if (cleanableHiveDataset.isShouldDeleteData()) {
+            cleanableHiveDataset.getFsCleanableHelper().clean(hiveDatasetVersion, possiblyEmptyDirectories);
+          }
           client.get().dropPartition(partition.getTable().getDbName(), partition.getTable().getTableName(), partition.getValues(), false);
           log.info("Successfully dropped partition " + partition.getCompleteName());
         } else {
           log.info("Simulating drop partition " + partition.getCompleteName());
         }
-        if (cleanableHiveDataset.isShouldDeleteData()) {
-          cleanableHiveDataset.getFsCleanableHelper().clean(hiveDatasetVersion, possiblyEmptyDirectories);
-        }
       } catch (TException | IOException e) {
         log.warn(String.format("Failed to completely delete partition %s.", partition.getCompleteName()), e);
         throw new IOException(e);
       }
     }
-    cleanableHiveDataset.getFsCleanableHelper().cleanEmptyDirectories(possiblyEmptyDirectories, cleanableHiveDataset);
+    try {
+      cleanableHiveDataset.getFsCleanableHelper().cleanEmptyDirectories(possiblyEmptyDirectories, cleanableHiveDataset);
+    } catch (IOException ex) {
+      log.warn(String.format("Failed to delete one or more of the %s empty directories under root path %s", possiblyEmptyDirectories.size(), cleanableHiveDataset.datasetRoot()), ex);
+      throw new IOException(ex);
+    }
   }
 
   @Override
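
For readability, the net effect of the hunk above is sketched below with the diff markers removed. This is a sketch only: the enclosing method and the surrounding code that provide `client`, `hiveDatasetVersion`, `possiblyEmptyDirectories`, `cleanableHiveDataset`, and `log` sit outside the shown hunk, so they are represented by a comment rather than reproduced.

    // Sketch of the cleanup flow after this change (diff markers removed). The enclosing
    // method and the code that provides `client`, `hiveDatasetVersion`, `possiblyEmptyDirectories`,
    // `cleanableHiveDataset`, and `log` are outside the hunk shown above.
    Partition partition = hiveDatasetVersion.getPartition();
    try {
      if (!cleanableHiveDataset.isSimulate()) {
        // Delete the underlying hdfs files first, so a dropped partition can never leave dangling files behind.
        if (cleanableHiveDataset.isShouldDeleteData()) {
          cleanableHiveDataset.getFsCleanableHelper().clean(hiveDatasetVersion, possiblyEmptyDirectories);
        }
        // Only then drop the hive partition itself.
        client.get().dropPartition(partition.getTable().getDbName(), partition.getTable().getTableName(),
            partition.getValues(), false);
        log.info("Successfully dropped partition " + partition.getCompleteName());
      } else {
        log.info("Simulating drop partition " + partition.getCompleteName());
      }
    } catch (TException | IOException e) {
      // Propagating the exception makes the retention job fail instead of silently leaving files behind.
      log.warn(String.format("Failed to completely delete partition %s.", partition.getCompleteName()), e);
      throw new IOException(e);
    }
    // Failures while cleaning up empty directories are now propagated as well.
    try {
      cleanableHiveDataset.getFsCleanableHelper().cleanEmptyDirectories(possiblyEmptyDirectories, cleanableHiveDataset);
    } catch (IOException ex) {
      log.warn(String.format("Failed to delete one or more of the %s empty directories under root path %s",
          possiblyEmptyDirectories.size(), cleanableHiveDataset.datasetRoot()), ex);
      throw new IOException(ex);
    }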