Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2021/01/04 02:35:09 UTC

[GitHub] [hudi] rmpifer commented on a change in pull request #2398: [HUDI-842] Implementation of HUDI RFC-15.

rmpifer commented on a change in pull request #2398:
URL: https://github.com/apache/hudi/pull/2398#discussion_r551070357



##########
File path: hudi-cli/src/main/java/org/apache/hudi/cli/commands/MetadataCommand.java
##########
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.cli.commands;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.cli.HoodieCLI;
+import org.apache.hudi.cli.utils.SparkUtil;
+import org.apache.hudi.client.common.HoodieSparkEngineContext;
+import org.apache.hudi.common.config.HoodieMetadataConfig;
+import org.apache.hudi.common.util.HoodieTimer;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.metadata.HoodieBackedTableMetadata;
+import org.apache.hudi.metadata.HoodieTableMetadata;
+import org.apache.hudi.metrics.SparkHoodieBackedTableMetadataWriter;
+
+import org.apache.spark.api.java.JavaSparkContext;
+import org.springframework.shell.core.CommandMarker;
+import org.springframework.shell.core.annotation.CliCommand;
+import org.springframework.shell.core.annotation.CliOption;
+import org.springframework.stereotype.Component;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * CLI commands to operate on the Metadata Table.
+ */
+@Component
+public class MetadataCommand implements CommandMarker {
+
+  private JavaSparkContext jsc;
+  private static String metadataBaseDirectory;
+
+  /**
+   * Sets the directory to store/read Metadata Table.
+   *
+   * This can be used to store the metadata table away from the dataset directory.
+   *  - Useful for testing as well as for using via the HUDI CLI so that the actual dataset is not written to.
+   *  - Useful for testing Metadata Table performance and operations on existing datasets before enabling.
+   */
+  public static void setMetadataBaseDirectory(String metadataDir) {
+    ValidationUtils.checkState(metadataBaseDirectory == null,
+        "metadataBaseDirectory is already set to " + metadataBaseDirectory);
+    metadataBaseDirectory = metadataDir;
+  }
+
+  public static String getMetadataTableBasePath(String tableBasePath) {
+    if (metadataBaseDirectory != null) {
+      return metadataBaseDirectory;
+    }
+    return HoodieTableMetadata.getMetadataTableBasePath(tableBasePath);
+  }
+
+  @CliCommand(value = "metadata set", help = "Set options for Metadata Table")
+  public String set(@CliOption(key = {"metadataDir"},
+      help = "Directory to read/write metadata table (can be different from dataset)", unspecifiedDefaultValue = "")
+                    final String metadataDir) {
+    if (!metadataDir.isEmpty()) {
+      setMetadataBaseDirectory(metadataDir);
+    }
+
+    return "Ok";
+  }

Review comment:
       Does it make sense to have this configurability in the CLI? During write operations this field is not configurable, so the metadata will always be updated at `tableBasePath + '.hoodie/metadata/'`

##########
File path: hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordScanner.java
##########
@@ -202,8 +203,7 @@ public void scan() {
                     LOG.info("Rolling back the last corrupted log block read in " + logFile.getPath());
                     currentInstantLogBlocks.pop();
                     numBlocksRolledBack++;
-                  } else if (lastBlock.getBlockType() != CORRUPT_BLOCK

Review comment:
       Is it OK to remove this `blockType != CORRUPT_BLOCK` condition?

##########
File path: hudi-cli/src/main/java/org/apache/hudi/cli/commands/MetadataCommand.java
##########
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.cli.commands;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.cli.HoodieCLI;
+import org.apache.hudi.cli.utils.SparkUtil;
+import org.apache.hudi.client.common.HoodieSparkEngineContext;
+import org.apache.hudi.common.config.HoodieMetadataConfig;
+import org.apache.hudi.common.util.HoodieTimer;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.metadata.HoodieBackedTableMetadata;
+import org.apache.hudi.metadata.HoodieTableMetadata;
+import org.apache.hudi.metrics.SparkHoodieBackedTableMetadataWriter;
+
+import org.apache.spark.api.java.JavaSparkContext;
+import org.springframework.shell.core.CommandMarker;
+import org.springframework.shell.core.annotation.CliCommand;
+import org.springframework.shell.core.annotation.CliOption;
+import org.springframework.stereotype.Component;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * CLI commands to operate on the Metadata Table.
+ */
+@Component
+public class MetadataCommand implements CommandMarker {
+
+  private JavaSparkContext jsc;
+  private static String metadataBaseDirectory;
+
+  /**
+   * Sets the directory to store/read Metadata Table.
+   *
+   * This can be used to store the metadata table away from the dataset directory.
+   *  - Useful for testing as well as for using via the HUDI CLI so that the actual dataset is not written to.
+   *  - Useful for testing Metadata Table performance and operations on existing datasets before enabling.
+   */
+  public static void setMetadataBaseDirectory(String metadataDir) {
+    ValidationUtils.checkState(metadataBaseDirectory == null,
+        "metadataBaseDirectory is already set to " + metadataBaseDirectory);
+    metadataBaseDirectory = metadataDir;
+  }
+
+  public static String getMetadataTableBasePath(String tableBasePath) {
+    if (metadataBaseDirectory != null) {
+      return metadataBaseDirectory;
+    }
+    return HoodieTableMetadata.getMetadataTableBasePath(tableBasePath);
+  }
+
+  @CliCommand(value = "metadata set", help = "Set options for Metadata Table")
+  public String set(@CliOption(key = {"metadataDir"},
+      help = "Directory to read/write metadata table (can be different from dataset)", unspecifiedDefaultValue = "")
+                    final String metadataDir) {
+    if (!metadataDir.isEmpty()) {
+      setMetadataBaseDirectory(metadataDir);
+    }
+
+    return "Ok";
+  }
+
+  @CliCommand(value = "metadata create", help = "Create the Metadata Table if it does not exist")
+  public String create() throws IOException {
+    HoodieCLI.getTableMetaClient();
+    Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath));
+    try {
+      FileStatus[] statuses = HoodieCLI.fs.listStatus(metadataPath);
+      if (statuses.length > 0) {
+        throw new RuntimeException("Metadata directory (" + metadataPath.toString() + ") not empty.");
+      }
+    } catch (FileNotFoundException e) {
+      // Metadata directory does not exist yet
+      HoodieCLI.fs.mkdirs(metadataPath);
+    }
+
+    HoodieTimer timer = new HoodieTimer().startTimer();
+    HoodieWriteConfig writeConfig = getWriteConfig();
+    initJavaSparkContext();
+    SparkHoodieBackedTableMetadataWriter.create(HoodieCLI.conf, writeConfig, new HoodieSparkEngineContext(jsc));
+    return String.format("Created Metadata Table in %s (duration=%.2f secs)", metadataPath, timer.endTimer() / 1000.0);
+  }
+
+  @CliCommand(value = "metadata delete", help = "Remove the Metadata Table")
+  public String delete() throws Exception {
+    HoodieCLI.getTableMetaClient();
+    Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath));
+    try {
+      FileStatus[] statuses = HoodieCLI.fs.listStatus(metadataPath);
+      if (statuses.length > 0) {
+        HoodieCLI.fs.delete(metadataPath, true);
+      }
+    } catch (FileNotFoundException e) {
+      // Metadata directory does not exist
+    }
+
+    return String.format("Removed Metadata Table from %s", metadataPath);
+  }
+
+  @CliCommand(value = "metadata init", help = "Update the metadata table from commits since the creation")
+  public String init(@CliOption(key = {"readonly"}, unspecifiedDefaultValue = "false",

Review comment:
       [Minor] Not sure what the purpose of the `readonly` config is here. This command doesn't return any values; its only purpose seems to be to perform some sort of write operation

##########
File path: hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieWrapperFileSystem.java
##########
@@ -389,28 +456,28 @@ public Path resolvePath(Path p) throws IOException {
 
   @Override
   public FSDataInputStream open(Path f) throws IOException {
-    return fileSystem.open(convertToDefaultPath(f));
+    return wrapInputStream(f, fileSystem.open(convertToDefaultPath(f)));
   }
 
   @Override
   public FSDataOutputStream createNonRecursive(Path f, boolean overwrite, int bufferSize, short replication,
-      long blockSize, Progressable progress) throws IOException {
+                                               long blockSize, Progressable progress) throws IOException {
     Path p = convertToDefaultPath(f);
     return wrapOutputStream(p,
         fileSystem.createNonRecursive(p, overwrite, bufferSize, replication, blockSize, progress));
   }
 
   @Override
   public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, boolean overwrite, int bufferSize,
-      short replication, long blockSize, Progressable progress) throws IOException {
+                                               short replication, long blockSize, Progressable progress) throws IOException {
     Path p = convertToDefaultPath(f);
     return wrapOutputStream(p,
         fileSystem.createNonRecursive(p, permission, overwrite, bufferSize, replication, blockSize, progress));
   }
 
   @Override
   public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, EnumSet<CreateFlag> flags,
-      int bufferSize, short replication, long blockSize, Progressable progress) throws IOException {
+                                               int bufferSize, short replication, long blockSize, Progressable progress) throws IOException {
     Path p = convertToDefaultPath(f);

Review comment:
       [Minor] Possible to remove these unneeded indentation changes?

##########
File path: hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
##########
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.metadata;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.avro.model.HoodieCleanMetadata;
+import org.apache.hudi.avro.model.HoodieCleanerPlan;
+import org.apache.hudi.avro.model.HoodieRestoreMetadata;
+import org.apache.hudi.avro.model.HoodieRollbackMetadata;
+import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
+import org.apache.hudi.common.util.CleanerUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import static org.apache.hudi.metadata.HoodieTableMetadata.NON_PARTITIONED_NAME;
+
+/**
+ * A utility to convert timeline information to metadata table records.
+ */
+public class HoodieTableMetadataUtil {
+
+  private static final Logger LOG = LogManager.getLogger(HoodieTableMetadataUtil.class);
+
+  /**
+   * Converts a timeline instant to metadata table records.
+   *
+   * @param datasetMetaClient The meta client associated with the timeline instant
+   * @param instant to fetch and convert to metadata table records
+   * @return a list of metadata table records
+   * @throws IOException
+   */
+  public static Option<List<HoodieRecord>> convertInstantToMetaRecords(HoodieTableMetaClient datasetMetaClient, HoodieInstant instant, Option<String> lastSyncTs) throws IOException {
+    HoodieTimeline timeline = datasetMetaClient.getActiveTimeline();
+    Option<List<HoodieRecord>> records = Option.empty();
+    ValidationUtils.checkArgument(instant.isCompleted(), "Only completed instants can be synced.");
+
+    switch (instant.getAction()) {
+      case HoodieTimeline.CLEAN_ACTION:
+        HoodieCleanMetadata cleanMetadata = CleanerUtils.getCleanerMetadata(datasetMetaClient, instant);
+        records = Option.of(convertMetadataToRecords(cleanMetadata, instant.getTimestamp()));
+        break;
+      case HoodieTimeline.DELTA_COMMIT_ACTION:
+      case HoodieTimeline.COMMIT_ACTION:
+      case HoodieTimeline.COMPACTION_ACTION:
+        HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
+            timeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class);
+        records = Option.of(convertMetadataToRecords(commitMetadata, instant.getTimestamp()));
+        break;
+      case HoodieTimeline.ROLLBACK_ACTION:
+        HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeHoodieRollbackMetadata(
+            timeline.getInstantDetails(instant).get());
+        records = Option.of(convertMetadataToRecords(rollbackMetadata, instant.getTimestamp(), lastSyncTs));
+        break;
+      case HoodieTimeline.RESTORE_ACTION:
+        HoodieRestoreMetadata restoreMetadata = TimelineMetadataUtils.deserializeHoodieRestoreMetadata(
+            timeline.getInstantDetails(instant).get());
+        records = Option.of(convertMetadataToRecords(restoreMetadata, instant.getTimestamp(), lastSyncTs));
+        break;
+      case HoodieTimeline.SAVEPOINT_ACTION:
+        // Nothing to be done here
+        break;
+      default:
+        throw new HoodieException("Unknown type of action " + instant.getAction());

Review comment:
       +1 Are there other new actions we need to take into account as well, e.g. replace/clustering?
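
       A hedged sketch of one way the switch could cover replace commits (used by clustering and insert overwrite). Since `HoodieReplaceCommitMetadata` extends `HoodieCommitMetadata`, reusing the existing commit-metadata conversion is assumed to work; none of this is in the PR:

       ```java
       // Sketch only: possible extra case in convertInstantToMetaRecords().
       // Reusing convertMetadataToRecords(...) for replace commits is an assumption.
       case HoodieTimeline.REPLACE_COMMIT_ACTION:
         HoodieReplaceCommitMetadata replaceMetadata = HoodieCommitMetadata.fromBytes(
             timeline.getInstantDetails(instant).get(), HoodieReplaceCommitMetadata.class);
         records = Option.of(convertMetadataToRecords(replaceMetadata, instant.getTimestamp()));
         break;
       ```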

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java
##########
@@ -242,28 +246,41 @@ public Configuration getHadoopConf() {
    * Get the view of the file system for this table.
    */
   public TableFileSystemView getFileSystemView() {
-    return new HoodieTableFileSystemView(metaClient, getCompletedCommitsTimeline());
+    if (config.useFileListingMetadata()) {
+      return getFileSystemViewInternal(getCompletedCommitsTimeline());
+    } else {
+      return new HoodieTableFileSystemView(metaClient, getCompletedCommitsTimeline());
+    }
   }
 
   /**
    * Get the base file only view of the file system for this table.
    */
   public BaseFileOnlyView getBaseFileOnlyView() {
-    return getViewManager().getFileSystemView(metaClient);
+    return getFileSystemViewInternal(metaClient.getActiveTimeline().filterCompletedAndCompactionInstants());
   }
 
   /**
    * Get the full view of the file system for this table.
    */
   public SliceView getSliceView() {
-    return getViewManager().getFileSystemView(metaClient);
+    return getFileSystemViewInternal(metaClient.getActiveTimeline().filterCompletedAndCompactionInstants());
   }
 
   /**
    * Get complete view of the file system for this table with ability to force sync.
    */
   public SyncableFileSystemView getHoodieView() {
-    return getViewManager().getFileSystemView(metaClient);
+    return getFileSystemViewInternal(metaClient.getActiveTimeline().filterCompletedAndCompactionInstants());
+  }
+
+  private SyncableFileSystemView getFileSystemViewInternal(HoodieTimeline timeline) {
+    if (config.useFileListingMetadata()) {
+      FileSystemViewStorageConfig viewConfig = config.getViewStorageConfig();
+      return new HoodieMetadataFileSystemView(metaClient, this.metadata(), timeline, viewConfig.isIncrementalTimelineSyncEnabled());
+    } else {
+      return getViewManager().getFileSystemView(metaClient);

Review comment:
       [Minor] We could dedup the repeated logic by using `HoodieTableFileSystemView.createInMemoryFileSystemView`.
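
       For illustration, a minimal sketch of that dedup, assuming a factory along the lines of `HoodieTableFileSystemView.createInMemoryFileSystemView(metaClient, timeline)` (the exact signature is not shown in this diff):

       ```java
       // Sketch only: one factory call for the non-metadata branch instead of
       // going through getViewManager().
       private SyncableFileSystemView getFileSystemViewInternal(HoodieTimeline timeline) {
         if (config.useFileListingMetadata()) {
           FileSystemViewStorageConfig viewConfig = config.getViewStorageConfig();
           return new HoodieMetadataFileSystemView(metaClient, this.metadata(), timeline,
               viewConfig.isIncrementalTimelineSyncEnabled());
         }
         return HoodieTableFileSystemView.createInMemoryFileSystemView(metaClient, timeline);
       }
       ```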

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java
##########
@@ -242,28 +246,41 @@ public Configuration getHadoopConf() {
    * Get the view of the file system for this table.
    */
   public TableFileSystemView getFileSystemView() {
-    return new HoodieTableFileSystemView(metaClient, getCompletedCommitsTimeline());
+    if (config.useFileListingMetadata()) {
+      return getFileSystemViewInternal(getCompletedCommitsTimeline());
+    } else {
+      return new HoodieTableFileSystemView(metaClient, getCompletedCommitsTimeline());
+    }

Review comment:
       [Minor] `getFileSystemViewInternal` essentially does the same thing; could we not just call it directly here?
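
       A minimal sketch of the suggested simplification, relying on `getFileSystemViewInternal` to do the `config.useFileListingMetadata()` branching itself (per the observation above that the two branches are essentially equivalent):

       ```java
       // Sketch only: delegate directly instead of duplicating the branch.
       public TableFileSystemView getFileSystemView() {
         return getFileSystemViewInternal(getCompletedCommitsTimeline());
       }
       ```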

##########
File path: hudi-cli/src/main/java/org/apache/hudi/cli/commands/MetadataCommand.java
##########
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.cli.commands;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.cli.HoodieCLI;
+import org.apache.hudi.cli.utils.SparkUtil;
+import org.apache.hudi.client.common.HoodieSparkEngineContext;
+import org.apache.hudi.common.config.HoodieMetadataConfig;
+import org.apache.hudi.common.util.HoodieTimer;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.metadata.HoodieBackedTableMetadata;
+import org.apache.hudi.metadata.HoodieTableMetadata;
+import org.apache.hudi.metrics.SparkHoodieBackedTableMetadataWriter;
+
+import org.apache.spark.api.java.JavaSparkContext;
+import org.springframework.shell.core.CommandMarker;
+import org.springframework.shell.core.annotation.CliCommand;
+import org.springframework.shell.core.annotation.CliOption;
+import org.springframework.stereotype.Component;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * CLI commands to operate on the Metadata Table.
+ */
+@Component
+public class MetadataCommand implements CommandMarker {
+
+  private JavaSparkContext jsc;
+  private static String metadataBaseDirectory;
+
+  /**
+   * Sets the directory to store/read Metadata Table.
+   *
+   * This can be used to store the metadata table away from the dataset directory.
+   *  - Useful for testing as well as for using via the HUDI CLI so that the actual dataset is not written to.
+   *  - Useful for testing Metadata Table performance and operations on existing datasets before enabling.
+   */
+  public static void setMetadataBaseDirectory(String metadataDir) {
+    ValidationUtils.checkState(metadataBaseDirectory == null,
+        "metadataBaseDirectory is already set to " + metadataBaseDirectory);
+    metadataBaseDirectory = metadataDir;
+  }
+
+  public static String getMetadataTableBasePath(String tableBasePath) {
+    if (metadataBaseDirectory != null) {
+      return metadataBaseDirectory;
+    }
+    return HoodieTableMetadata.getMetadataTableBasePath(tableBasePath);
+  }
+
+  @CliCommand(value = "metadata set", help = "Set options for Metadata Table")
+  public String set(@CliOption(key = {"metadataDir"},
+      help = "Directory to read/write metadata table (can be different from dataset)", unspecifiedDefaultValue = "")
+                    final String metadataDir) {
+    if (!metadataDir.isEmpty()) {
+      setMetadataBaseDirectory(metadataDir);
+    }
+
+    return "Ok";
+  }
+
+  @CliCommand(value = "metadata create", help = "Create the Metadata Table if it does not exist")
+  public String create() throws IOException {
+    HoodieCLI.getTableMetaClient();
+    Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath));
+    try {
+      FileStatus[] statuses = HoodieCLI.fs.listStatus(metadataPath);
+      if (statuses.length > 0) {
+        throw new RuntimeException("Metadata directory (" + metadataPath.toString() + ") not empty.");
+      }
+    } catch (FileNotFoundException e) {
+      // Metadata directory does not exist yet
+      HoodieCLI.fs.mkdirs(metadataPath);
+    }
+
+    HoodieTimer timer = new HoodieTimer().startTimer();
+    HoodieWriteConfig writeConfig = getWriteConfig();
+    initJavaSparkContext();
+    SparkHoodieBackedTableMetadataWriter.create(HoodieCLI.conf, writeConfig, new HoodieSparkEngineContext(jsc));
+    return String.format("Created Metadata Table in %s (duration=%.2f secs)", metadataPath, timer.endTimer() / 1000.0);
+  }
+
+  @CliCommand(value = "metadata delete", help = "Remove the Metadata Table")
+  public String delete() throws Exception {
+    HoodieCLI.getTableMetaClient();
+    Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath));
+    try {
+      FileStatus[] statuses = HoodieCLI.fs.listStatus(metadataPath);
+      if (statuses.length > 0) {
+        HoodieCLI.fs.delete(metadataPath, true);
+      }
+    } catch (FileNotFoundException e) {
+      // Metadata directory does not exist
+    }
+
+    return String.format("Removed Metadata Table from %s", metadataPath);
+  }
+
+  @CliCommand(value = "metadata init", help = "Update the metadata table from commits since the creation")

Review comment:
       [Minor] Maybe rename this to `metadata sync`, based on the information provided in `help`. This command does not initialize the metadata table.
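
       For illustration, the renamed command might look like this (the name and help text are suggestions only, not part of this PR):

       ```java
       // Sketch only: command name aligned with what the help text describes.
       @CliCommand(value = "metadata sync", help = "Sync the metadata table with commits made since its creation")
       ```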

##########
File path: hudi-common/src/main/java/org/apache/hudi/common/fs/SizeAwareFSDataOutputStream.java
##########
@@ -43,7 +43,7 @@
   private final ConsistencyGuard consistencyGuard;
 
   public SizeAwareFSDataOutputStream(Path path, FSDataOutputStream out, ConsistencyGuard consistencyGuard,
-      Runnable closeCallback) throws IOException {
+                                     Runnable closeCallback) throws IOException {

Review comment:
       [Minor] Unneeded indentation

##########
File path: hudi-cli/src/main/java/org/apache/hudi/cli/commands/MetadataCommand.java
##########
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.cli.commands;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.cli.HoodieCLI;
+import org.apache.hudi.cli.utils.SparkUtil;
+import org.apache.hudi.client.common.HoodieSparkEngineContext;
+import org.apache.hudi.common.config.HoodieMetadataConfig;
+import org.apache.hudi.common.util.HoodieTimer;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.metadata.HoodieBackedTableMetadata;
+import org.apache.hudi.metadata.HoodieTableMetadata;
+import org.apache.hudi.metrics.SparkHoodieBackedTableMetadataWriter;
+
+import org.apache.spark.api.java.JavaSparkContext;
+import org.springframework.shell.core.CommandMarker;
+import org.springframework.shell.core.annotation.CliCommand;
+import org.springframework.shell.core.annotation.CliOption;
+import org.springframework.stereotype.Component;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * CLI commands to operate on the Metadata Table.
+ */
+@Component
+public class MetadataCommand implements CommandMarker {
+
+  private JavaSparkContext jsc;
+  private static String metadataBaseDirectory;
+
+  /**
+   * Sets the directory to store/read Metadata Table.
+   *
+   * This can be used to store the metadata table away from the dataset directory.
+   *  - Useful for testing as well as for using via the HUDI CLI so that the actual dataset is not written to.
+   *  - Useful for testing Metadata Table performance and operations on existing datasets before enabling.
+   */
+  public static void setMetadataBaseDirectory(String metadataDir) {
+    ValidationUtils.checkState(metadataBaseDirectory == null,
+        "metadataBaseDirectory is already set to " + metadataBaseDirectory);
+    metadataBaseDirectory = metadataDir;
+  }
+
+  public static String getMetadataTableBasePath(String tableBasePath) {
+    if (metadataBaseDirectory != null) {
+      return metadataBaseDirectory;
+    }
+    return HoodieTableMetadata.getMetadataTableBasePath(tableBasePath);
+  }
+
+  @CliCommand(value = "metadata set", help = "Set options for Metadata Table")
+  public String set(@CliOption(key = {"metadataDir"},
+      help = "Directory to read/write metadata table (can be different from dataset)", unspecifiedDefaultValue = "")
+                    final String metadataDir) {
+    if (!metadataDir.isEmpty()) {
+      setMetadataBaseDirectory(metadataDir);
+    }
+
+    return "Ok";
+  }
+
+  @CliCommand(value = "metadata create", help = "Create the Metadata Table if it does not exist")
+  public String create() throws IOException {
+    HoodieCLI.getTableMetaClient();
+    Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath));
+    try {
+      FileStatus[] statuses = HoodieCLI.fs.listStatus(metadataPath);
+      if (statuses.length > 0) {
+        throw new RuntimeException("Metadata directory (" + metadataPath.toString() + ") not empty.");
+      }
+    } catch (FileNotFoundException e) {
+      // Metadata directory does not exist yet
+      HoodieCLI.fs.mkdirs(metadataPath);
+    }
+
+    HoodieTimer timer = new HoodieTimer().startTimer();
+    HoodieWriteConfig writeConfig = getWriteConfig();
+    initJavaSparkContext();
+    SparkHoodieBackedTableMetadataWriter.create(HoodieCLI.conf, writeConfig, new HoodieSparkEngineContext(jsc));
+    return String.format("Created Metadata Table in %s (duration=%.2f secs)", metadataPath, timer.endTimer() / 1000.0);
+  }
+
+  @CliCommand(value = "metadata delete", help = "Remove the Metadata Table")
+  public String delete() throws Exception {
+    HoodieCLI.getTableMetaClient();
+    Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath));
+    try {
+      FileStatus[] statuses = HoodieCLI.fs.listStatus(metadataPath);

Review comment:
       [Minor] Rather than listing, we should be able to just call delete on the path if it exists.
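
       A minimal sketch of that simplification, using the standard Hadoop `FileSystem#exists` and `FileSystem#delete` calls:

       ```java
       // Sketch only: delete the metadata directory directly if it exists,
       // with recursive=true, instead of listing its contents first.
       Path metadataPath = new Path(getMetadataTableBasePath(HoodieCLI.basePath));
       if (HoodieCLI.fs.exists(metadataPath)) {
         HoodieCLI.fs.delete(metadataPath, true);
       }
       return String.format("Removed Metadata Table from %s", metadataPath);
       ```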




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org