You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2021/02/04 11:40:25 UTC

[GitHub] [hudi] nsivabalan commented on a change in pull request #2210: [HUDI-1348] Provide option to clean up DFS sources

nsivabalan commented on a change in pull request #2210:
URL: https://github.com/apache/hudi/pull/2210#discussion_r570157010



##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/FileSourceCleaner.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.sources.helpers;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP;
+
+public abstract class FileSourceCleaner {
+  private static final Logger LOG = LogManager.getLogger(FileSourceCleaner.class);
+
+  /**
+   * Configs supported by {@link FileSourceCleaner}.
+   */
+  public static class Config {
+    private Config() {}
+
+    // Clean mode for ingested source files: "delete", "archive", or "off" (see CleanMode).
+    public static final String FILE_SOURCE_CLEAN_MODE_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.mode";
+    // Default mode: OFF, i.e. no clean-up is performed.
+    public static final String DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY = CleanMode.OFF.name();
+
+    // Number of background threads used for clean-up; a non-positive value makes clean-up synchronous.
+    public static final String FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.numThreads";
+    public static final int DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL = 1;
+
+    // Target directory for archived files; required when clean mode is "archive" (no default).
+    public static final String FILE_SOURCE_ARCHIVE_DIR_KEY = "hoodie.deltastreamer.source.dfs.clean.archiveDir";
+  }
+
+  /**
+   * Supported clean-up strategies for source files that were ingested successfully.
+   */
+  private enum CleanMode {
+    DELETE,  // permanently delete the source file (FileSourceRemover)
+    ARCHIVE, // move the source file under a configured archive directory (FileSourceArchiver)
+    OFF      // no clean-up; the default mode
+  }
+
+  // Optional worker pool for asynchronous clean-up. Empty when numThreads <= 0,
+  // in which case clean() runs the task directly on the calling thread.
+  // NOTE(review): the pool is never shut down in this class — confirm its
+  // lifecycle is managed by the owning process.
+  private final Option<ExecutorService> cleanerPool;
+
+  protected FileSourceCleaner(TypedProperties props) {
+    // Non-positive thread count disables the pool entirely (synchronous clean-up).
+    int numCleanerThreads = props.getInteger(Config.FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY,
+        Config.DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL);
+    cleanerPool = (numCleanerThreads > 0) ? Option.of(Executors.newFixedThreadPool(numCleanerThreads)) : Option.empty();
+  }
+
+  /**
+   * Factory method to create a {@link FileSourceCleaner} based on properties.
+   *
+   * @param props configuration; reads {@link Config#FILE_SOURCE_CLEAN_MODE_OPT_KEY}
+   * @param fs    filesystem used to delete or move ingested source files
+   * @return the cleaner implementing the configured clean mode
+   * @throws IllegalArgumentException if the configured mode is not delete/archive/off
+   */
+  public static FileSourceCleaner create(TypedProperties props, FileSystem fs) {
+    final String cleanMode = props.getString(Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY, Config.DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY);
+    final CleanMode mode;
+    try {
+      mode = CleanMode.valueOf(cleanMode.toUpperCase());
+    } catch (IllegalArgumentException e) {
+      // This message previously lived in the switch's default branch, which was
+      // unreachable: Enum.valueOf already throws for unknown names. Rethrow with
+      // the user-friendly message and preserve the original cause.
+      throw new IllegalArgumentException(String.format("Unknown option %s for %s. Available options are: "
+          + "delete, archive, off(default)", cleanMode, Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY), e);
+    }
+    switch (mode) {
+      case DELETE:
+        return new FileSourceRemover(props, fs);
+      case ARCHIVE:
+        return new FileSourceArchiver(props, fs);
+      case OFF:
+      default:
+        return new FileSourceCleanerNoOp(props);
+    }
+  }
+
+  /**
+   * Clean up a file that has been ingested successfully. Runs asynchronously on
+   * the cleaner pool when one is configured, otherwise on the caller's thread.
+   */
+  public void clean(String file) {
+    final Runnable task = () -> cleanTask(file);
+    if (cleanerPool.isPresent()) {
+      cleanerPool.get().submit(task);
+    } else {
+      task.run();
+    }
+  }
+
+  abstract void cleanTask(String file);
+
+  /**
+   * Clean-up strategy that permanently deletes an ingested source file.
+   */
+  private static class FileSourceRemover extends FileSourceCleaner {
+    private final FileSystem fs;
+
+    public FileSourceRemover(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      LOG.info(String.format("Removing %s...", file));
+      try {
+        // Non-recursive delete: `file` is expected to be a single source file.
+        if (fs.delete(new Path(file), false)) {
+          // Fixed log message: was "Successfully remove up %s".
+          LOG.info(String.format("Successfully removed %s", file));
+        } else {
+          // delete() returned false without throwing (e.g. file no longer exists).
+          LOG.warn(String.format("Failed to remove %s", file));
+        }
+      } catch (IOException e) {
+        // Best-effort clean-up: log and continue rather than failing ingestion.
+        LOG.error(String.format("Failed to remove %s", file), e);
+      }
+    }
+  }
+
+  private static class FileSourceArchiver extends FileSourceCleaner {
+    private final FileSystem fs;
+    // Root under which archived files are placed, mirroring their source layout.
+    private final Path archiveDir;
+    // Root of the DFS source input; used to compute each file's relative directory.
+    private final Path sourceRootDir;
+
+    public FileSourceArchiver(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+      this.archiveDir = new Path(props.getString(Config.FILE_SOURCE_ARCHIVE_DIR_KEY));
+      this.sourceRootDir = new Path(props.getString(ROOT_INPUT_PATH_PROP));
+      // Reject an archive dir inside the source tree — presumably to avoid
+      // re-ingesting archived files as new input; TODO confirm with the selector.
+      ValidationUtils.checkArgument(!isSubDir(archiveDir, sourceRootDir),
+          String.format("%s must not be child of %s", Config.FILE_SOURCE_ARCHIVE_DIR_KEY, ROOT_INPUT_PATH_PROP));
+    }
+
+    // True when parentDir equals childDir or any of childDir's ancestors.
+    private boolean isSubDir(Path childDir, Path parentDir) {
+      for (Path current = childDir; current != null; current = current.getParent()) {
+        if (current.equals(parentDir)) {
+          return true;
+        }
+      }
+      return false;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      final Path source = new Path(file);
+      try {
+        // Mirror the file's directory layout (relative to the source root)
+        // underneath the archive directory.
+        final Path targetDir = new Path(archiveDir, getRelativeDir(source.getParent(), sourceRootDir));
+        LOG.info("Creating directory if not existent: " + targetDir.toString());
+        fs.mkdirs(targetDir);
+
+        final Path target = new Path(targetDir, source.getName());
+        LOG.info(String.format("Renaming: %s to %s", source.toString(), target));
+        if (fs.rename(source, target)) {
+          LOG.info(String.format("Successfully archive %s", file));
+        } else {
+          LOG.warn(String.format("Failed to archive %s", file));
+        }
+      } catch (IOException e) {
+        // Best-effort: archiving failures are logged, not propagated.
+        LOG.error(String.format("Failed to archive %s", file), e);
+      }
+    }
+
+    private Path getRelativeDir(Path childPath, Path parentPath) {
+      LinkedList<String> paths = new LinkedList<>();
+      while (childPath != null && !childPath.equals(parentPath)) {
+        paths.addFirst(childPath.getName());
+        childPath = childPath.getParent();
+      }
+      return new Path(paths.isEmpty() ? "." : String.join("/", paths));

Review comment:
       Thanks for clarifying. Actually, we can leave it as is.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org