Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2021/01/28 01:44:50 UTC

[GitHub] [hudi] nsivabalan commented on a change in pull request #2210: [HUDI-1348] Provide option to clean up DFS sources

nsivabalan commented on a change in pull request #2210:
URL: https://github.com/apache/hudi/pull/2210#discussion_r565759271



##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/FileSourceCleaner.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.sources.helpers;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP;
+
+public abstract class FileSourceCleaner {
+  private static final Logger LOG = LogManager.getLogger(FileSourceCleaner.class);
+
+  /**
+   * Configs supported.
+   */
+  public static class Config {
+    private Config() {}
+
+    public static final String FILE_SOURCE_CLEAN_MODE_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.mode";
+    public static final String DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY = CleanMode.OFF.name();
+
+    public static final String FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.numThreads";
+    public static final int DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL = 1;
+
+    public static final String FILE_SOURCE_ARCHIVE_DIR_KEY = "hoodie.deltastreamer.source.dfs.clean.archiveDir";
+  }
+
+  private enum CleanMode {
+    DELETE,
+    ARCHIVE,
+    OFF
+  }
+
+  private final Option<ExecutorService> cleanerPool;
+
+  protected FileSourceCleaner(TypedProperties props) {
+    int numCleanerThreads = props.getInteger(Config.FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY,
+        Config.DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL);
+    cleanerPool = (numCleanerThreads > 0) ? Option.of(Executors.newFixedThreadPool(numCleanerThreads)) : Option.empty();
+  }
+
+  /**
+   * Factory method to create FileSourceCleaner based on properties.
+   */
+  public static FileSourceCleaner create(TypedProperties props, FileSystem fs) {
+    final String cleanMode = props.getString(Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY, Config.DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY);
+    switch (CleanMode.valueOf(cleanMode.toUpperCase())) {
+      case DELETE:
+        return new FileSourceRemover(props, fs);
+      case ARCHIVE:
+        return new FileSourceArchiver(props, fs);
+      case OFF:
+        return new FileSourceCleanerNoOp(props);
+      default:
+        throw new IllegalArgumentException(String.format("Unknown option %s for %s. Available options are: "
+            + "delete, archive, off(default)", cleanMode, Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY));
+    }
+  }
+
+  /**
+   * Clean up a file that has been ingested successfully.
+   */
+  public void clean(String file) {
+    if (cleanerPool.isPresent()) {
+      cleanerPool.get().submit(() -> cleanTask(file));
+    } else {
+      cleanTask(file);
+    }
+  }
+
+  abstract void cleanTask(String file);
+
+  private static class FileSourceRemover extends FileSourceCleaner {
+    private final FileSystem fs;
+    public FileSourceRemover(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      LOG.info(String.format("Removing %s...", file));
+      try {
+        if (fs.delete(new Path(file), false)) {
+          LOG.info(String.format("Successfully remove up %s", file));
+        } else {
+          LOG.warn(String.format("Failed to remove %s", file));
+        }
+      } catch (IOException e) {
+        LOG.error(String.format("Failed to remove %s", file), e);
+      }
+    }
+  }
+
+  private static class FileSourceArchiver extends FileSourceCleaner {
+    private final FileSystem fs;
+    private final Path archiveDir;
+    private final Path sourceRootDir;
+
+    public FileSourceArchiver(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+      this.archiveDir = new Path(props.getString(Config.FILE_SOURCE_ARCHIVE_DIR_KEY));
+      this.sourceRootDir = new Path(props.getString(ROOT_INPUT_PATH_PROP));
+      ValidationUtils.checkArgument(!isSubDir(archiveDir, sourceRootDir),
+          String.format("%s must not be child of %s", Config.FILE_SOURCE_ARCHIVE_DIR_KEY, ROOT_INPUT_PATH_PROP));
+    }
+
+    private boolean isSubDir(Path childDir, Path parentDir) {
+      while (childDir != null) {
+        if (childDir.equals(parentDir)) {
+          return true;
+        }
+        childDir = childDir.getParent();
+      }
+      return false;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      try {
+        final Path original = new Path(file);
+        final Path fileDir = original.getParent();
+        Path relativeDir = getRelativeDir(fileDir, sourceRootDir);
+        final Path newDir = new Path(archiveDir, relativeDir);
+        LOG.info("Creating directory if not existent: " + newDir.toString());
+        fs.mkdirs(newDir);
+
+        final Path newFile = new Path(newDir, original.getName());

Review comment:
       minor: maybe we can name this "archived_file"

##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/FileSourceCleaner.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.sources.helpers;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP;
+
+public abstract class FileSourceCleaner {
+  private static final Logger LOG = LogManager.getLogger(FileSourceCleaner.class);
+
+  /**
+   * Configs supported.
+   */
+  public static class Config {
+    private Config() {}
+
+    public static final String FILE_SOURCE_CLEAN_MODE_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.mode";
+    public static final String DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY = CleanMode.OFF.name();
+
+    public static final String FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.numThreads";
+    public static final int DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL = 1;
+
+    public static final String FILE_SOURCE_ARCHIVE_DIR_KEY = "hoodie.deltastreamer.source.dfs.clean.archiveDir";
+  }
+
+  private enum CleanMode {
+    DELETE,
+    ARCHIVE,
+    OFF
+  }
+
+  private final Option<ExecutorService> cleanerPool;
+
+  protected FileSourceCleaner(TypedProperties props) {
+    int numCleanerThreads = props.getInteger(Config.FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY,
+        Config.DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL);
+    cleanerPool = (numCleanerThreads > 0) ? Option.of(Executors.newFixedThreadPool(numCleanerThreads)) : Option.empty();
+  }
+
+  /**
+   * Factory method to create FileSourceCleaner based on properties.
+   */
+  public static FileSourceCleaner create(TypedProperties props, FileSystem fs) {
+    final String cleanMode = props.getString(Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY, Config.DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY);
+    switch (CleanMode.valueOf(cleanMode.toUpperCase())) {
+      case DELETE:
+        return new FileSourceRemover(props, fs);
+      case ARCHIVE:
+        return new FileSourceArchiver(props, fs);
+      case OFF:
+        return new FileSourceCleanerNoOp(props);
+      default:
+        throw new IllegalArgumentException(String.format("Unknown option %s for %s. Available options are: "
+            + "delete, archive, off(default)", cleanMode, Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY));
+    }
+  }
+
+  /**
+   * Clean up a file that has been ingested successfully.
+   */
+  public void clean(String file) {
+    if (cleanerPool.isPresent()) {
+      cleanerPool.get().submit(() -> cleanTask(file));
+    } else {
+      cleanTask(file);
+    }
+  }
+
+  abstract void cleanTask(String file);
+
+  private static class FileSourceRemover extends FileSourceCleaner {
+    private final FileSystem fs;
+    public FileSourceRemover(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      LOG.info(String.format("Removing %s...", file));
+      try {
+        if (fs.delete(new Path(file), false)) {
+          LOG.info(String.format("Successfully remove up %s", file));
+        } else {
+          LOG.warn(String.format("Failed to remove %s", file));
+        }
+      } catch (IOException e) {
+        LOG.error(String.format("Failed to remove %s", file), e);
+      }
+    }
+  }
+
+  private static class FileSourceArchiver extends FileSourceCleaner {
+    private final FileSystem fs;
+    private final Path archiveDir;
+    private final Path sourceRootDir;
+
+    public FileSourceArchiver(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+      this.archiveDir = new Path(props.getString(Config.FILE_SOURCE_ARCHIVE_DIR_KEY));
+      this.sourceRootDir = new Path(props.getString(ROOT_INPUT_PATH_PROP));
+      ValidationUtils.checkArgument(!isSubDir(archiveDir, sourceRootDir),
+          String.format("%s must not be child of %s", Config.FILE_SOURCE_ARCHIVE_DIR_KEY, ROOT_INPUT_PATH_PROP));
+    }
+
+    private boolean isSubDir(Path childDir, Path parentDir) {
+      while (childDir != null) {
+        if (childDir.equals(parentDir)) {
+          return true;
+        }
+        childDir = childDir.getParent();
+      }
+      return false;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      try {
+        final Path original = new Path(file);
+        final Path fileDir = original.getParent();
+        Path relativeDir = getRelativeDir(fileDir, sourceRootDir);
+        final Path newDir = new Path(archiveDir, relativeDir);
+        LOG.info("Creating directory if not existent: " + newDir.toString());
+        fs.mkdirs(newDir);

Review comment:
       if (!fs.exists(newDir)) {
         fs.mkdirs(newDir);
       }

##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/CsvDFSSource.java
##########
@@ -125,4 +134,12 @@ public CsvDFSSource(TypedProperties props,
       return Option.empty();
     }
   }
+
+  @Override

Review comment:
       I understand that at least for Kafka this may not be applicable, but since we repeat this for almost all sources, can we move it to the Source class itself? All impls would only need to update the inputFiles list; the rest would be taken care of by Source.postCommit(). This also means the fileCleaner and the input file list would need to be declared in the Source class. A rough sketch is below.
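
       A minimal sketch of what I mean, assuming hypothetical names (fileCleaner, inputFiles, postCommit) in the Source base class:

       // In Source.java (sketch only; field and method names are assumptions).
       // Assumes java.util.List / java.util.ArrayList are imported.
       protected transient FileSourceCleaner fileCleaner;
       protected final List<String> inputFiles = new ArrayList<>();

       // Invoked once a commit succeeds; subclasses only need to add the paths
       // they read into inputFiles when fetching new data.
       public void postCommit() {
         for (String file : inputFiles) {
           fileCleaner.clean(file);
         }
         inputFiles.clear();
       }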
   

##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/FileSourceCleaner.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.sources.helpers;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP;
+
+public abstract class FileSourceCleaner {
+  private static final Logger LOG = LogManager.getLogger(FileSourceCleaner.class);
+
+  /**
+   * Configs supported.
+   */
+  public static class Config {
+    private Config() {}
+
+    public static final String FILE_SOURCE_CLEAN_MODE_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.mode";
+    public static final String DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY = CleanMode.OFF.name();
+
+    public static final String FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.numThreads";
+    public static final int DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL = 1;
+
+    public static final String FILE_SOURCE_ARCHIVE_DIR_KEY = "hoodie.deltastreamer.source.dfs.clean.archiveDir";
+  }
+
+  private enum CleanMode {
+    DELETE,
+    ARCHIVE,
+    OFF
+  }
+
+  private final Option<ExecutorService> cleanerPool;
+
+  protected FileSourceCleaner(TypedProperties props) {
+    int numCleanerThreads = props.getInteger(Config.FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY,
+        Config.DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL);
+    cleanerPool = (numCleanerThreads > 0) ? Option.of(Executors.newFixedThreadPool(numCleanerThreads)) : Option.empty();
+  }
+
+  /**
+   * Factory method to create FileSourceCleaner based on properties.
+   */
+  public static FileSourceCleaner create(TypedProperties props, FileSystem fs) {
+    final String cleanMode = props.getString(Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY, Config.DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY);
+    switch (CleanMode.valueOf(cleanMode.toUpperCase())) {
+      case DELETE:
+        return new FileSourceRemover(props, fs);
+      case ARCHIVE:
+        return new FileSourceArchiver(props, fs);
+      case OFF:
+        return new FileSourceCleanerNoOp(props);
+      default:
+        throw new IllegalArgumentException(String.format("Unknown option %s for %s. Available options are: "
+            + "delete, archive, off(default)", cleanMode, Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY));
+    }
+  }
+
+  /**
+   * Clean up a file that has been ingested successfully.
+   */
+  public void clean(String file) {
+    if (cleanerPool.isPresent()) {
+      cleanerPool.get().submit(() -> cleanTask(file));
+    } else {
+      cleanTask(file);
+    }
+  }
+
+  abstract void cleanTask(String file);
+
+  private static class FileSourceRemover extends FileSourceCleaner {
+    private final FileSystem fs;
+    public FileSourceRemover(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      LOG.info(String.format("Removing %s...", file));
+      try {
+        if (fs.delete(new Path(file), false)) {
+          LOG.info(String.format("Successfully remove up %s", file));
+        } else {
+          LOG.warn(String.format("Failed to remove %s", file));
+        }
+      } catch (IOException e) {
+        LOG.error(String.format("Failed to remove %s", file), e);
+      }
+    }
+  }
+
+  private static class FileSourceArchiver extends FileSourceCleaner {
+    private final FileSystem fs;
+    private final Path archiveDir;
+    private final Path sourceRootDir;
+
+    public FileSourceArchiver(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+      this.archiveDir = new Path(props.getString(Config.FILE_SOURCE_ARCHIVE_DIR_KEY));
+      this.sourceRootDir = new Path(props.getString(ROOT_INPUT_PATH_PROP));
+      ValidationUtils.checkArgument(!isSubDir(archiveDir, sourceRootDir),
+          String.format("%s must not be child of %s", Config.FILE_SOURCE_ARCHIVE_DIR_KEY, ROOT_INPUT_PATH_PROP));
+    }
+
+    private boolean isSubDir(Path childDir, Path parentDir) {
+      while (childDir != null) {
+        if (childDir.equals(parentDir)) {
+          return true;
+        }
+        childDir = childDir.getParent();
+      }
+      return false;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      try {
+        final Path original = new Path(file);
+        final Path fileDir = original.getParent();
+        Path relativeDir = getRelativeDir(fileDir, sourceRootDir);
+        final Path newDir = new Path(archiveDir, relativeDir);
+        LOG.info("Creating directory if not existent: " + newDir.toString());
+        fs.mkdirs(newDir);
+
+        final Path newFile = new Path(newDir, original.getName());
+        LOG.info(String.format("Renaming: %s to %s", original.toString(), newFile));
+        if (fs.rename(original, newFile)) {
+          LOG.info(String.format("Successfully archive %s", file));
+        } else {
+          LOG.warn(String.format("Failed to archive %s", file));
+        }
+      } catch (IOException e) {
+        LOG.error(String.format("Failed to archive %s", file), e);
+      }
+    }
+
+    private Path getRelativeDir(Path childPath, Path parentPath) {
+      LinkedList<String> paths = new LinkedList<>();
+      while (childPath != null && !childPath.equals(parentPath)) {
+        paths.addFirst(childPath.getName());
+        childPath = childPath.getParent();
+      }
+      return new Path(paths.isEmpty() ? "." : String.join("/", paths));
+    }
+  }
+
+  private static class FileSourceCleanerNoOp extends FileSourceCleaner {

Review comment:
       @vinothchandar / @bvaradar : What's a good way to go about a no-op impl in general? Can we leave fileCleaner as null and call fileCleaner.cleanTask() only when it is non-null from the callers, or keep this class and do nothing within cleanTask()? Both options are sketched below.
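
       Roughly, the two options (for illustration only):

       // Option 1: nullable cleaner, every caller guards the call.
       if (fileCleaner != null) {
         fileCleaner.clean(file);
       }

       // Option 2: keep a dedicated no-op implementation so callers stay unconditional.
       private static class FileSourceNoOpCleaner extends FileSourceCleaner {
         protected FileSourceNoOpCleaner(TypedProperties props) {
           super(props);
         }

         @Override
         void cleanTask(String file) {
           // intentionally empty
         }
       }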

##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/FileSourceCleaner.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.sources.helpers;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP;
+
+public abstract class FileSourceCleaner {
+  private static final Logger LOG = LogManager.getLogger(FileSourceCleaner.class);
+
+  /**
+   * Configs supported.
+   */
+  public static class Config {
+    private Config() {}
+
+    public static final String FILE_SOURCE_CLEAN_MODE_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.mode";
+    public static final String DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY = CleanMode.OFF.name();
+
+    public static final String FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.numThreads";
+    public static final int DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL = 1;
+
+    public static final String FILE_SOURCE_ARCHIVE_DIR_KEY = "hoodie.deltastreamer.source.dfs.clean.archiveDir";
+  }
+
+  private enum CleanMode {
+    DELETE,

Review comment:
       I know it's self-explanatory, but a one-line Javadoc per value would be nice.
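
       Something along these lines would do:

       private enum CleanMode {
         /** Delete source files after they have been ingested successfully. */
         DELETE,
         /** Move source files into the configured archive directory after ingestion. */
         ARCHIVE,
         /** Leave source files untouched (default). */
         OFF
       }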

##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/FileSourceCleaner.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.sources.helpers;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP;
+
+public abstract class FileSourceCleaner {
+  private static final Logger LOG = LogManager.getLogger(FileSourceCleaner.class);
+
+  /**
+   * Configs supported.
+   */
+  public static class Config {
+    private Config() {}
+
+    public static final String FILE_SOURCE_CLEAN_MODE_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.mode";
+    public static final String DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY = CleanMode.OFF.name();
+
+    public static final String FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.numThreads";
+    public static final int DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL = 1;
+
+    public static final String FILE_SOURCE_ARCHIVE_DIR_KEY = "hoodie.deltastreamer.source.dfs.clean.archiveDir";
+  }
+
+  private enum CleanMode {
+    DELETE,
+    ARCHIVE,
+    OFF
+  }
+
+  private final Option<ExecutorService> cleanerPool;
+
+  protected FileSourceCleaner(TypedProperties props) {
+    int numCleanerThreads = props.getInteger(Config.FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY,
+        Config.DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL);
+    cleanerPool = (numCleanerThreads > 0) ? Option.of(Executors.newFixedThreadPool(numCleanerThreads)) : Option.empty();
+  }
+
+  /**
+   * Factory method to create FileSourceCleaner based on properties.
+   */
+  public static FileSourceCleaner create(TypedProperties props, FileSystem fs) {
+    final String cleanMode = props.getString(Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY, Config.DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY);
+    switch (CleanMode.valueOf(cleanMode.toUpperCase())) {
+      case DELETE:
+        return new FileSourceRemover(props, fs);
+      case ARCHIVE:
+        return new FileSourceArchiver(props, fs);
+      case OFF:
+        return new FileSourceCleanerNoOp(props);
+      default:
+        throw new IllegalArgumentException(String.format("Unknown option %s for %s. Available options are: "
+            + "delete, archive, off(default)", cleanMode, Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY));
+    }
+  }
+
+  /**
+   * Clean up a file that has been ingested successfully.
+   */
+  public void clean(String file) {
+    if (cleanerPool.isPresent()) {
+      cleanerPool.get().submit(() -> cleanTask(file));
+    } else {
+      cleanTask(file);
+    }
+  }
+
+  abstract void cleanTask(String file);
+
+  private static class FileSourceRemover extends FileSourceCleaner {
+    private final FileSystem fs;
+    public FileSourceRemover(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      LOG.info(String.format("Removing %s...", file));
+      try {
+        if (fs.delete(new Path(file), false)) {
+          LOG.info(String.format("Successfully remove up %s", file));
+        } else {
+          LOG.warn(String.format("Failed to remove %s", file));
+        }
+      } catch (IOException e) {
+        LOG.error(String.format("Failed to remove %s", file), e);
+      }
+    }
+  }
+
+  private static class FileSourceArchiver extends FileSourceCleaner {
+    private final FileSystem fs;

Review comment:
       If the FileSystem is used in all cleaner impls, we might as well declare it in the base class, e.g. along the lines of the sketch below.
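
       Sketch only (assumes every impl actually needs the handle):

       public abstract class FileSourceCleaner {
         protected final FileSystem fs;

         protected FileSourceCleaner(TypedProperties props, FileSystem fs) {
           this.fs = fs;
           // ... existing thread-pool setup ...
         }
       }

       // Subclasses would then just call super(props, fs) and drop their own fs fields.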

##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/FileSourceCleaner.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.sources.helpers;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP;
+
+public abstract class FileSourceCleaner {
+  private static final Logger LOG = LogManager.getLogger(FileSourceCleaner.class);
+
+  /**
+   * Configs supported.
+   */
+  public static class Config {
+    private Config() {}
+
+    public static final String FILE_SOURCE_CLEAN_MODE_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.mode";
+    public static final String DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY = CleanMode.OFF.name();
+
+    public static final String FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.numThreads";
+    public static final int DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL = 1;
+
+    public static final String FILE_SOURCE_ARCHIVE_DIR_KEY = "hoodie.deltastreamer.source.dfs.clean.archiveDir";
+  }
+
+  private enum CleanMode {
+    DELETE,
+    ARCHIVE,
+    OFF
+  }
+
+  private final Option<ExecutorService> cleanerPool;
+
+  protected FileSourceCleaner(TypedProperties props) {
+    int numCleanerThreads = props.getInteger(Config.FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY,
+        Config.DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL);
+    cleanerPool = (numCleanerThreads > 0) ? Option.of(Executors.newFixedThreadPool(numCleanerThreads)) : Option.empty();
+  }
+
+  /**
+   * Factory method to create FileSourceCleaner based on properties.
+   */
+  public static FileSourceCleaner create(TypedProperties props, FileSystem fs) {
+    final String cleanMode = props.getString(Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY, Config.DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY);
+    switch (CleanMode.valueOf(cleanMode.toUpperCase())) {
+      case DELETE:
+        return new FileSourceRemover(props, fs);
+      case ARCHIVE:
+        return new FileSourceArchiver(props, fs);
+      case OFF:
+        return new FileSourceCleanerNoOp(props);
+      default:
+        throw new IllegalArgumentException(String.format("Unknown option %s for %s. Available options are: "
+            + "delete, archive, off(default)", cleanMode, Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY));
+    }
+  }
+
+  /**
+   * Clean up a file that has been ingested successfully.
+   */
+  public void clean(String file) {
+    if (cleanerPool.isPresent()) {
+      cleanerPool.get().submit(() -> cleanTask(file));
+    } else {
+      cleanTask(file);
+    }
+  }
+
+  abstract void cleanTask(String file);
+
+  private static class FileSourceRemover extends FileSourceCleaner {
+    private final FileSystem fs;
+    public FileSourceRemover(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      LOG.info(String.format("Removing %s...", file));
+      try {
+        if (fs.delete(new Path(file), false)) {
+          LOG.info(String.format("Successfully remove up %s", file));
+        } else {
+          LOG.warn(String.format("Failed to remove %s", file));
+        }
+      } catch (IOException e) {
+        LOG.error(String.format("Failed to remove %s", file), e);
+      }
+    }
+  }
+
+  private static class FileSourceArchiver extends FileSourceCleaner {
+    private final FileSystem fs;
+    private final Path archiveDir;
+    private final Path sourceRootDir;
+
+    public FileSourceArchiver(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+      this.archiveDir = new Path(props.getString(Config.FILE_SOURCE_ARCHIVE_DIR_KEY));
+      this.sourceRootDir = new Path(props.getString(ROOT_INPUT_PATH_PROP));
+      ValidationUtils.checkArgument(!isSubDir(archiveDir, sourceRootDir),
+          String.format("%s must not be child of %s", Config.FILE_SOURCE_ARCHIVE_DIR_KEY, ROOT_INPUT_PATH_PROP));
+    }
+
+    private boolean isSubDir(Path childDir, Path parentDir) {
+      while (childDir != null) {
+        if (childDir.equals(parentDir)) {
+          return true;
+        }
+        childDir = childDir.getParent();
+      }
+      return false;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      try {
+        final Path original = new Path(file);
+        final Path fileDir = original.getParent();
+        Path relativeDir = getRelativeDir(fileDir, sourceRootDir);
+        final Path newDir = new Path(archiveDir, relativeDir);
+        LOG.info("Creating directory if not existent: " + newDir.toString());
+        fs.mkdirs(newDir);
+
+        final Path newFile = new Path(newDir, original.getName());
+        LOG.info(String.format("Renaming: %s to %s", original.toString(), newFile));
+        if (fs.rename(original, newFile)) {
+          LOG.info(String.format("Successfully archive %s", file));
+        } else {
+          LOG.warn(String.format("Failed to archive %s", file));
+        }
+      } catch (IOException e) {
+        LOG.error(String.format("Failed to archive %s", file), e);
+      }
+    }
+
+    private Path getRelativeDir(Path childPath, Path parentPath) {
+      LinkedList<String> paths = new LinkedList<>();
+      while (childPath != null && !childPath.equals(parentPath)) {
+        paths.addFirst(childPath.getName());
+        childPath = childPath.getParent();
+      }
+      return new Path(paths.isEmpty() ? "." : String.join("/", paths));
+    }
+  }
+
+  private static class FileSourceCleanerNoOp extends FileSourceCleaner {

Review comment:
       What do you think of "FileSourceNoOpCleaner" instead of FileSourceCleanerNoOp?

##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/FileSourceCleaner.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.sources.helpers;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP;
+
+public abstract class FileSourceCleaner {
+  private static final Logger LOG = LogManager.getLogger(FileSourceCleaner.class);
+
+  /**
+   * Configs supported.
+   */
+  public static class Config {
+    private Config() {}
+
+    public static final String FILE_SOURCE_CLEAN_MODE_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.mode";
+    public static final String DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY = CleanMode.OFF.name();
+
+    public static final String FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.numThreads";
+    public static final int DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL = 1;
+
+    public static final String FILE_SOURCE_ARCHIVE_DIR_KEY = "hoodie.deltastreamer.source.dfs.clean.archiveDir";
+  }
+
+  private enum CleanMode {
+    DELETE,
+    ARCHIVE,
+    OFF
+  }
+
+  private final Option<ExecutorService> cleanerPool;
+
+  protected FileSourceCleaner(TypedProperties props) {
+    int numCleanerThreads = props.getInteger(Config.FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY,
+        Config.DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL);
+    cleanerPool = (numCleanerThreads > 0) ? Option.of(Executors.newFixedThreadPool(numCleanerThreads)) : Option.empty();
+  }
+
+  /**
+   * Factory method to create FileSourceCleaner based on properties.
+   */
+  public static FileSourceCleaner create(TypedProperties props, FileSystem fs) {
+    final String cleanMode = props.getString(Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY, Config.DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY);
+    switch (CleanMode.valueOf(cleanMode.toUpperCase())) {
+      case DELETE:
+        return new FileSourceRemover(props, fs);
+      case ARCHIVE:
+        return new FileSourceArchiver(props, fs);
+      case OFF:
+        return new FileSourceCleanerNoOp(props);
+      default:
+        throw new IllegalArgumentException(String.format("Unknown option %s for %s. Available options are: "
+            + "delete, archive, off(default)", cleanMode, Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY));
+    }
+  }
+
+  /**
+   * Clean up a file that has been ingested successfully.
+   */
+  public void clean(String file) {
+    if (cleanerPool.isPresent()) {
+      cleanerPool.get().submit(() -> cleanTask(file));
+    } else {
+      cleanTask(file);
+    }
+  }
+
+  abstract void cleanTask(String file);
+
+  private static class FileSourceRemover extends FileSourceCleaner {
+    private final FileSystem fs;
+    public FileSourceRemover(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      LOG.info(String.format("Removing %s...", file));
+      try {
+        if (fs.delete(new Path(file), false)) {
+          LOG.info(String.format("Successfully remove up %s", file));
+        } else {
+          LOG.warn(String.format("Failed to remove %s", file));
+        }
+      } catch (IOException e) {
+        LOG.error(String.format("Failed to remove %s", file), e);
+      }
+    }
+  }
+
+  private static class FileSourceArchiver extends FileSourceCleaner {
+    private final FileSystem fs;
+    private final Path archiveDir;
+    private final Path sourceRootDir;
+
+    public FileSourceArchiver(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+      this.archiveDir = new Path(props.getString(Config.FILE_SOURCE_ARCHIVE_DIR_KEY));
+      this.sourceRootDir = new Path(props.getString(ROOT_INPUT_PATH_PROP));
+      ValidationUtils.checkArgument(!isSubDir(archiveDir, sourceRootDir),
+          String.format("%s must not be child of %s", Config.FILE_SOURCE_ARCHIVE_DIR_KEY, ROOT_INPUT_PATH_PROP));
+    }
+
+    private boolean isSubDir(Path childDir, Path parentDir) {
+      while (childDir != null) {
+        if (childDir.equals(parentDir)) {
+          return true;
+        }
+        childDir = childDir.getParent();
+      }
+      return false;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      try {
+        final Path original = new Path(file);
+        final Path fileDir = original.getParent();
+        Path relativeDir = getRelativeDir(fileDir, sourceRootDir);
+        final Path newDir = new Path(archiveDir, relativeDir);
+        LOG.info("Creating directory if not existent: " + newDir.toString());
+        fs.mkdirs(newDir);
+
+        final Path newFile = new Path(newDir, original.getName());
+        LOG.info(String.format("Renaming: %s to %s", original.toString(), newFile));
+        if (fs.rename(original, newFile)) {
+          LOG.info(String.format("Successfully archive %s", file));
+        } else {
+          LOG.warn(String.format("Failed to archive %s", file));
+        }
+      } catch (IOException e) {
+        LOG.error(String.format("Failed to archive %s", file), e);
+      }
+    }
+
+    private Path getRelativeDir(Path childPath, Path parentPath) {
+      LinkedList<String> paths = new LinkedList<>();
+      while (childPath != null && !childPath.equals(parentPath)) {
+        paths.addFirst(childPath.getName());
+        childPath = childPath.getParent();
+      }
+      return new Path(paths.isEmpty() ? "." : String.join("/", paths));
+    }
+  }
+
+  private static class FileSourceCleanerNoOp extends FileSourceCleaner {
+    protected FileSourceCleanerNoOp(TypedProperties props) {
+      super(props);
+    }
+
+    @Override
+    void cleanTask(String file) {
+      LOG.info("No hoodie.deltastreamer.source.dfs.clean was specified. Leaving source unchanged.");

Review comment:
       Do we need this info logging? I feel we can leave it empty.

##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/FileSourceCleaner.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.sources.helpers;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP;
+
+public abstract class FileSourceCleaner {
+  private static final Logger LOG = LogManager.getLogger(FileSourceCleaner.class);
+
+  /**
+   * Configs supported.
+   */
+  public static class Config {
+    private Config() {}
+
+    public static final String FILE_SOURCE_CLEAN_MODE_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.mode";
+    public static final String DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY = CleanMode.OFF.name();
+
+    public static final String FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.numThreads";
+    public static final int DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL = 1;
+
+    public static final String FILE_SOURCE_ARCHIVE_DIR_KEY = "hoodie.deltastreamer.source.dfs.clean.archiveDir";
+  }
+
+  private enum CleanMode {
+    DELETE,
+    ARCHIVE,
+    OFF
+  }
+
+  private final Option<ExecutorService> cleanerPool;
+
+  protected FileSourceCleaner(TypedProperties props) {
+    int numCleanerThreads = props.getInteger(Config.FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY,
+        Config.DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL);
+    cleanerPool = (numCleanerThreads > 0) ? Option.of(Executors.newFixedThreadPool(numCleanerThreads)) : Option.empty();
+  }
+
+  /**
+   * Factory method to create FileSourceCleaner based on properties.
+   */
+  public static FileSourceCleaner create(TypedProperties props, FileSystem fs) {
+    final String cleanMode = props.getString(Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY, Config.DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY);
+    switch (CleanMode.valueOf(cleanMode.toUpperCase())) {
+      case DELETE:
+        return new FileSourceRemover(props, fs);
+      case ARCHIVE:
+        return new FileSourceArchiver(props, fs);
+      case OFF:
+        return new FileSourceCleanerNoOp(props);
+      default:
+        throw new IllegalArgumentException(String.format("Unknown option %s for %s. Available options are: "
+            + "delete, archive, off(default)", cleanMode, Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY));
+    }
+  }
+
+  /**
+   * Clean up a file that has been ingested successfully.
+   */
+  public void clean(String file) {
+    if (cleanerPool.isPresent()) {

Review comment:
       Do you think it is unwarranted to check whether "file" is actually a file and not a directory? If it is a directory, we could throw an exception or something; see the sketch below.
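
       A minimal sketch, assuming a FileSystem handle is reachable from clean():

       public void clean(String file) throws IOException {
         if (fs.getFileStatus(new Path(file)).isDirectory()) {
           throw new IllegalArgumentException(file + " is a directory, expected a file");
         }
         // ... submit cleanTask(file) as before ...
       }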

##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/FileSourceCleaner.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.sources.helpers;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP;
+
+public abstract class FileSourceCleaner {

Review comment:
       Please add class-level Javadoc, e.g. something like the sketch below.
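
       For instance:

       /**
        * Cleans up DFS source files that have been ingested successfully, either by
        * deleting them or by moving them to an archive directory, based on config.
        */
       public abstract class FileSourceCleaner {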

##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/FileSourceCleaner.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.sources.helpers;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP;
+
+public abstract class FileSourceCleaner {
+  private static final Logger LOG = LogManager.getLogger(FileSourceCleaner.class);
+
+  /**
+   * Configs supported.
+   */
+  public static class Config {
+    private Config() {}
+
+    public static final String FILE_SOURCE_CLEAN_MODE_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.mode";
+    public static final String DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY = CleanMode.OFF.name();
+
+    public static final String FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY = "hoodie.deltastreamer.source.dfs.clean.numThreads";
+    public static final int DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL = 1;
+
+    public static final String FILE_SOURCE_ARCHIVE_DIR_KEY = "hoodie.deltastreamer.source.dfs.clean.archiveDir";
+  }
+
+  private enum CleanMode {
+    DELETE,
+    ARCHIVE,
+    OFF
+  }
+
+  private final Option<ExecutorService> cleanerPool;
+
+  protected FileSourceCleaner(TypedProperties props) {
+    int numCleanerThreads = props.getInteger(Config.FILE_SOURCE_CLEAN_NUM_THREADS_OPT_KEY,
+        Config.DEFAULT_FILE_SOURCE_CLEAN_NUM_THREADS_OPT_VAL);
+    cleanerPool = (numCleanerThreads > 0) ? Option.of(Executors.newFixedThreadPool(numCleanerThreads)) : Option.empty();
+  }
+
+  /**
+   * Factory method to create FileSourceCleaner based on properties.
+   */
+  public static FileSourceCleaner create(TypedProperties props, FileSystem fs) {
+    final String cleanMode = props.getString(Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY, Config.DEFAULT_FILE_SOURCE_CLEAN_MODE_OPT_KEY);
+    switch (CleanMode.valueOf(cleanMode.toUpperCase())) {
+      case DELETE:
+        return new FileSourceRemover(props, fs);
+      case ARCHIVE:
+        return new FileSourceArchiver(props, fs);
+      case OFF:
+        return new FileSourceCleanerNoOp(props);
+      default:
+        throw new IllegalArgumentException(String.format("Unknown option %s for %s. Available options are: "
+            + "delete, archive, off(default)", cleanMode, Config.FILE_SOURCE_CLEAN_MODE_OPT_KEY));
+    }
+  }
+
+  /**
+   * Clean up a file that has been ingested successfully.
+   */
+  public void clean(String file) {
+    if (cleanerPool.isPresent()) {
+      cleanerPool.get().submit(() -> cleanTask(file));
+    } else {
+      cleanTask(file);
+    }
+  }
+
+  abstract void cleanTask(String file);
+
+  private static class FileSourceRemover extends FileSourceCleaner {
+    private final FileSystem fs;
+    public FileSourceRemover(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      LOG.info(String.format("Removing %s...", file));
+      try {
+        if (fs.delete(new Path(file), false)) {
+          LOG.info(String.format("Successfully remove up %s", file));
+        } else {
+          LOG.warn(String.format("Failed to remove %s", file));
+        }
+      } catch (IOException e) {
+        LOG.error(String.format("Failed to remove %s", file), e);
+      }
+    }
+  }
+
+  private static class FileSourceArchiver extends FileSourceCleaner {
+    private final FileSystem fs;
+    private final Path archiveDir;
+    private final Path sourceRootDir;
+
+    public FileSourceArchiver(TypedProperties props, FileSystem fs) {
+      super(props);
+      this.fs = fs;
+      this.archiveDir = new Path(props.getString(Config.FILE_SOURCE_ARCHIVE_DIR_KEY));
+      this.sourceRootDir = new Path(props.getString(ROOT_INPUT_PATH_PROP));
+      ValidationUtils.checkArgument(!isSubDir(archiveDir, sourceRootDir),
+          String.format("%s must not be child of %s", Config.FILE_SOURCE_ARCHIVE_DIR_KEY, ROOT_INPUT_PATH_PROP));
+    }
+
+    private boolean isSubDir(Path childDir, Path parentDir) {
+      while (childDir != null) {
+        if (childDir.equals(parentDir)) {
+          return true;
+        }
+        childDir = childDir.getParent();
+      }
+      return false;
+    }
+
+    @Override
+    void cleanTask(String file) {
+      try {
+        final Path original = new Path(file);
+        final Path fileDir = original.getParent();
+        Path relativeDir = getRelativeDir(fileDir, sourceRootDir);
+        final Path newDir = new Path(archiveDir, relativeDir);
+        LOG.info("Creating directory if not existent: " + newDir.toString());
+        fs.mkdirs(newDir);
+
+        final Path newFile = new Path(newDir, original.getName());
+        LOG.info(String.format("Renaming: %s to %s", original.toString(), newFile));
+        if (fs.rename(original, newFile)) {
+          LOG.info(String.format("Successfully archive %s", file));
+        } else {
+          LOG.warn(String.format("Failed to archive %s", file));
+        }
+      } catch (IOException e) {
+        LOG.error(String.format("Failed to archive %s", file), e);
+      }
+    }
+
+    private Path getRelativeDir(Path childPath, Path parentPath) {
+      LinkedList<String> paths = new LinkedList<>();
+      while (childPath != null && !childPath.equals(parentPath)) {
+        paths.addFirst(childPath.getName());
+        childPath = childPath.getParent();
+      }
+      return new Path(paths.isEmpty() ? "." : String.join("/", paths));

Review comment:
       If the file to be archived is in the source root dir itself, this moves it to "archive_folder/./", right? May I know the rationale? A sketch of an alternative that skips the "." segment is below.
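
       Sketch only: special-case files that sit directly under the source root so no "." segment is appended.

       final Path newDir = fileDir.equals(sourceRootDir)
           ? archiveDir
           : new Path(archiveDir, getRelativeDir(fileDir, sourceRootDir));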

##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/FileSourceCleaner.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.sources.helpers;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP;
+
+public abstract class FileSourceCleaner {

Review comment:
       Just trying to understand: is cleaning applicable only to files? Wondering if we should name this SourceCleaner instead of FileSourceCleaner and have 3 cleaners: one for file deletion, one for file archiving, and a no-op one.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org