Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2021/09/01 20:42:50 UTC

[GitHub] [hudi] vinothchandar commented on a change in pull request #3426: [HUDI-2285] Metadata Table synchronous design

vinothchandar commented on a change in pull request #3426:
URL: https://github.com/apache/hudi/pull/3426#discussion_r700518633



##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java
##########
@@ -187,6 +188,7 @@ public boolean commitStats(String instantTime, List<HoodieWriteStat> stats, Opti
         lastCompletedTxnAndMetadata.isPresent() ? Option.of(lastCompletedTxnAndMetadata.get().getLeft()) : Option.empty());
     try {
       preCommit(instantTime, metadata);
+      table.getMetadataWriter().ifPresent(w -> ((HoodieTableMetadataWriter)w).update(metadata, instantTime));

Review comment:
       nts: first committing to metadata table
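
       An annotated sketch of the ordering at this call site (the comments are editorial notes, not code from the PR):

       ```java
       // commitStats(...): the metadata table is updated first, as a deltacommit on its own
       // timeline, and only afterwards is the instant completed on the dataset timeline.
       preCommit(instantTime, metadata);
       table.getMetadataWriter()
           .ifPresent(w -> ((HoodieTableMetadataWriter) w).update(metadata, instantTime));
       // If the dataset commit never completes, metadata table readers fence this deltacommit
       // out, since its instant time never appears as completed on the dataset timeline.
       ```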

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -110,23 +119,31 @@ protected HoodieBackedTableMetadataWriter(Configuration hadoopConf, HoodieWriteC
       ValidationUtils.checkArgument(!this.metadataWriteConfig.useFileListingMetadata(), "File listing cannot be used for Metadata Table");
 
       initRegistry();
-      HoodieTableMetaClient datasetMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(datasetWriteConfig.getBasePath()).build();
-      initialize(engineContext, datasetMetaClient);
-      if (enabled) {
-        // This is always called even in case the table was created for the first time. This is because
-        // initFromFilesystem() does file listing and hence may take a long time during which some new updates
-        // may have occurred on the table. Hence, calling this always ensures that the metadata is brought in sync
-        // with the active timeline.
-        HoodieTimer timer = new HoodieTimer().startTimer();
-        syncFromInstants(datasetMetaClient);
-        metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.SYNC_STR, timer.endTimer()));
-      }
+      this.datasetMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(datasetWriteConfig.getBasePath()).build();

Review comment:
       nts: loading this afresh here. 

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -396,37 +408,56 @@ private boolean bootstrapFromFilesystem(HoodieEngineContext engineContext, Hoodi
   }
 
   /**
-   * Sync the Metadata Table from the instants created on the dataset.
+   * Initialize shards for a partition.
    *
-   * @param datasetMetaClient {@code HoodieTableMetaClient} for the dataset
+   * Each shard is a single log file with the following format:
+   *    <fileIdPrefix>ABCD
+   * where ABCD are digits. This allows up to 9999 shards.
+   *
+   * Example:
+   *    fc9f18eb-6049-4f47-bc51-23884bef0001
+   *    fc9f18eb-6049-4f47-bc51-23884bef0002
    */
-  private void syncFromInstants(HoodieTableMetaClient datasetMetaClient) {
-    ValidationUtils.checkState(enabled, "Metadata table cannot be synced as it is not enabled");
-    // (re) init the metadata for reading.
-    initTableMetadata();
-    try {
-      List<HoodieInstant> instantsToSync = metadata.findInstantsToSyncForWriter();
-      if (instantsToSync.isEmpty()) {
-        return;
-      }
-
-      LOG.info("Syncing " + instantsToSync.size() + " instants to metadata table: " + instantsToSync);
-
-      // Read each instant in order and sync it to metadata table
-      for (HoodieInstant instant : instantsToSync) {
-        LOG.info("Syncing instant " + instant + " to metadata table");
-
-        Option<List<HoodieRecord>> records = HoodieTableMetadataUtil.convertInstantToMetaRecords(datasetMetaClient, instant, getLatestSyncedInstantTime());
-        if (records.isPresent()) {
-          commit(records.get(), MetadataPartitionType.FILES.partitionPath(), instant.getTimestamp());
-        }
+  private void initializeShards(HoodieTableMetaClient datasetMetaClient, String partition, String instantTime,
+      int shardCount) throws IOException {
+    ValidationUtils.checkArgument(shardCount <= 9999, "Maximum 9999 shards are supported.");
+
+    final String newFileId = FSUtils.createNewFileIdPfx();
+    final String newFileIdPrefix = newFileId.substring(0, 32);
+    final HashMap<HeaderMetadataType, String> blockHeader = new HashMap<>();
+    blockHeader.put(HeaderMetadataType.INSTANT_TIME, instantTime);
+    final HoodieDeleteBlock block = new HoodieDeleteBlock(new HoodieKey[0], blockHeader);
+
+    LOG.info(String.format("Creating %d shards for partition %s with base fileId %s at instant time %s",
+        shardCount, partition, newFileId, instantTime));
+    for (int i = 0; i < shardCount; ++i) {
+      // Generate a indexed fileId for each shard and write a log block into it to create the file.
+      final String shardFileId = String.format("%s%04d", newFileIdPrefix, i + 1);
+      ValidationUtils.checkArgument(newFileId.length() == shardFileId.length(), "FileId should be of length " + newFileId.length());
+      try {
+        HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder()
+            .onParentPath(FSUtils.getPartitionPath(metadataWriteConfig.getBasePath(), partition))
+            .withFileId(shardFileId).overBaseCommit(instantTime)
+            .withLogVersion(HoodieLogFile.LOGFILE_BASE_VERSION)
+            .withFileSize(0L)
+            .withSizeThreshold(metadataWriteConfig.getLogFileMaxSize())
+            .withFs(datasetMetaClient.getFs())
+            .withRolloverLogWriteToken(FSUtils.makeWriteToken(0, 0, 0))
+            .withLogWriteToken(FSUtils.makeWriteToken(0, 0, 0))
+            .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
+        writer.appendBlock(block);
+        writer.close();
+      } catch (InterruptedException e) {
+        throw new IOException("Failed to created record level index shard " + shardFileId, e);
       }
-      initTableMetadata();
-    } catch (IOException ioe) {
-      throw new HoodieIOException("Unable to sync instants from data to metadata table.", ioe);
     }
   }
 
+  protected String getShardFileName(String fileId, int shardIndex) {
+    ValidationUtils.checkArgument(shardIndex <= 9999, "Maximum 9999 shards are supported.");

Review comment:
       `9999`

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -396,37 +408,56 @@ private boolean bootstrapFromFilesystem(HoodieEngineContext engineContext, Hoodi
   }
 
   /**
-   * Sync the Metadata Table from the instants created on the dataset.
+   * Initialize shards for a partition.
    *
-   * @param datasetMetaClient {@code HoodieTableMetaClient} for the dataset
+   * Each shard is a single log file with the following format:
+   *    <fileIdPrefix>ABCD
+   * where ABCD are digits. This allows up to 9999 shards.
+   *
+   * Example:
+   *    fc9f18eb-6049-4f47-bc51-23884bef0001
+   *    fc9f18eb-6049-4f47-bc51-23884bef0002
    */
-  private void syncFromInstants(HoodieTableMetaClient datasetMetaClient) {
-    ValidationUtils.checkState(enabled, "Metadata table cannot be synced as it is not enabled");
-    // (re) init the metadata for reading.
-    initTableMetadata();
-    try {
-      List<HoodieInstant> instantsToSync = metadata.findInstantsToSyncForWriter();
-      if (instantsToSync.isEmpty()) {
-        return;
-      }
-
-      LOG.info("Syncing " + instantsToSync.size() + " instants to metadata table: " + instantsToSync);
-
-      // Read each instant in order and sync it to metadata table
-      for (HoodieInstant instant : instantsToSync) {
-        LOG.info("Syncing instant " + instant + " to metadata table");
-
-        Option<List<HoodieRecord>> records = HoodieTableMetadataUtil.convertInstantToMetaRecords(datasetMetaClient, instant, getLatestSyncedInstantTime());
-        if (records.isPresent()) {
-          commit(records.get(), MetadataPartitionType.FILES.partitionPath(), instant.getTimestamp());
-        }
+  private void initializeShards(HoodieTableMetaClient datasetMetaClient, String partition, String instantTime,
+      int shardCount) throws IOException {
+    ValidationUtils.checkArgument(shardCount <= 9999, "Maximum 9999 shards are supported.");
+
+    final String newFileId = FSUtils.createNewFileIdPfx();
+    final String newFileIdPrefix = newFileId.substring(0, 32);
+    final HashMap<HeaderMetadataType, String> blockHeader = new HashMap<>();
+    blockHeader.put(HeaderMetadataType.INSTANT_TIME, instantTime);
+    final HoodieDeleteBlock block = new HoodieDeleteBlock(new HoodieKey[0], blockHeader);
+
+    LOG.info(String.format("Creating %d shards for partition %s with base fileId %s at instant time %s",
+        shardCount, partition, newFileId, instantTime));
+    for (int i = 0; i < shardCount; ++i) {
+      // Generate a indexed fileId for each shard and write a log block into it to create the file.
+      final String shardFileId = String.format("%s%04d", newFileIdPrefix, i + 1);
+      ValidationUtils.checkArgument(newFileId.length() == shardFileId.length(), "FileId should be of length " + newFileId.length());
+      try {
+        HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder()
+            .onParentPath(FSUtils.getPartitionPath(metadataWriteConfig.getBasePath(), partition))
+            .withFileId(shardFileId).overBaseCommit(instantTime)
+            .withLogVersion(HoodieLogFile.LOGFILE_BASE_VERSION)
+            .withFileSize(0L)

Review comment:
       legit?

##########
File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java
##########
@@ -66,4 +70,20 @@ protected HoodieSparkTable(HoodieWriteConfig config, HoodieEngineContext context
   protected HoodieIndex<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> getIndex(HoodieWriteConfig config, HoodieEngineContext context) {
     return SparkHoodieIndex.createIndex(config);
   }
+
+  @Override
+  public Option<HoodieTableMetadataWriter> getMetadataWriter() {
+    if (!config.useFileListingMetadata()) {
+      return Option.empty();
+    }
+
+    try {
+      if (!metaClient.getFs().exists(new Path(HoodieTableMetadata.getMetadataTableBasePath(metaClient.getBasePath())))) {

Review comment:
       could we avoid this `exists()` somehow?
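
       One possible direction, purely as a sketch (the field and helper below are assumptions, not part of this PR): cache the result of the first check so the filesystem round trip happens at most once per table instance.

       ```java
       // Sketch: remember the outcome of the first existence check (names are assumed).
       private Boolean metadataTableExists;

       private boolean isMetadataTablePresent() throws IOException {
         if (metadataTableExists == null) {
           metadataTableExists = metaClient.getFs().exists(
               new Path(HoodieTableMetadata.getMetadataTableBasePath(metaClient.getBasePath())));
         }
         return metadataTableExists;
       }
       ```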

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java
##########
@@ -435,7 +425,6 @@ protected void postCommit(HoodieTable<T, I, K, O> table, HoodieCommitMetadata me
       HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(config, table);
       archiveLog.archiveIfRequired(context);
       autoCleanOnCommit();
-      syncTableMetadata();

Review comment:
       there is no additional sync process with this re-design.

##########
File path: hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java
##########
@@ -73,14 +73,13 @@
   // Metadata table's timeline and metaclient
   private HoodieTableMetaClient metaClient;
   private HoodieTableConfig tableConfig;
-  private List<FileSlice> latestFileSystemMetadataSlices;
   // should we reuse the open file handles, across calls
   private final boolean reuse;
 
-
-  // Readers for the base and log file which store the metadata
-  private transient HoodieFileReader<GenericRecord> baseFileReader;
-  private transient HoodieMetadataMergedLogRecordScanner logRecordScanner;
+  // Shards for each partition
+  private Map<String, List<FileSlice>> partitionToShardsMap;

Review comment:
       file groups?

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -86,10 +93,12 @@
 
   protected HoodieBackedTableMetadata metadata;
   protected HoodieTableMetaClient metaClient;
+  protected HoodieTableMetaClient datasetMetaClient;
   protected Option<HoodieMetadataMetrics> metrics;
   protected boolean enabled;
   protected SerializableConfiguration hadoopConf;
   protected final transient HoodieEngineContext engineContext;
+  protected TransactionManager txnManager;

Review comment:
       nts: this is so that metadata table itself can take multiple writers committing at the same time.

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -464,8 +487,14 @@ public void update(HoodieCleanerPlan cleanerPlan, String instantTime) {
   @Override
   public void update(HoodieCleanMetadata cleanMetadata, String instantTime) {
     if (enabled) {
-      List<HoodieRecord> records = HoodieTableMetadataUtil.convertMetadataToRecords(cleanMetadata, instantTime);
-      commit(records, MetadataPartitionType.FILES.partitionPath(), instantTime);
+      this.txnManager.beginTransaction(Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, instantTime)),

Review comment:
       can we share this code across these update() method overloads?
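
       One way to share it, as a rough sketch only (`commitWithinTransaction` and the `endTransaction()` call are assumptions, not taken from this PR):

       ```java
       // requires: import java.util.function.Supplier;
       private void commitWithinTransaction(String instantTime, Supplier<List<HoodieRecord>> records) {
         txnManager.beginTransaction(
             Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, instantTime)),
             Option.empty());
         try {
           commit(records.get(), MetadataPartitionType.FILES.partitionPath(), instantTime);
         } finally {
           txnManager.endTransaction();
         }
       }

       // Each overload then reduces to its metadata-record conversion, e.g.:
       @Override
       public void update(HoodieCleanMetadata cleanMetadata, String instantTime) {
         if (enabled) {
           commitWithinTransaction(instantTime,
               () -> HoodieTableMetadataUtil.convertMetadataToRecords(cleanMetadata, instantTime));
         }
       }
       ```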

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -396,37 +408,56 @@ private boolean bootstrapFromFilesystem(HoodieEngineContext engineContext, Hoodi
   }
 
   /**
-   * Sync the Metadata Table from the instants created on the dataset.
+   * Initialize shards for a partition.
    *
-   * @param datasetMetaClient {@code HoodieTableMetaClient} for the dataset
+   * Each shard is a single log file with the following format:
+   *    <fileIdPrefix>ABCD
+   * where ABCD are digits. This allows up to 9999 shards.
+   *
+   * Example:
+   *    fc9f18eb-6049-4f47-bc51-23884bef0001
+   *    fc9f18eb-6049-4f47-bc51-23884bef0002
    */
-  private void syncFromInstants(HoodieTableMetaClient datasetMetaClient) {
-    ValidationUtils.checkState(enabled, "Metadata table cannot be synced as it is not enabled");
-    // (re) init the metadata for reading.
-    initTableMetadata();
-    try {
-      List<HoodieInstant> instantsToSync = metadata.findInstantsToSyncForWriter();
-      if (instantsToSync.isEmpty()) {
-        return;
-      }
-
-      LOG.info("Syncing " + instantsToSync.size() + " instants to metadata table: " + instantsToSync);
-
-      // Read each instant in order and sync it to metadata table
-      for (HoodieInstant instant : instantsToSync) {
-        LOG.info("Syncing instant " + instant + " to metadata table");
-
-        Option<List<HoodieRecord>> records = HoodieTableMetadataUtil.convertInstantToMetaRecords(datasetMetaClient, instant, getLatestSyncedInstantTime());
-        if (records.isPresent()) {
-          commit(records.get(), MetadataPartitionType.FILES.partitionPath(), instant.getTimestamp());
-        }
+  private void initializeShards(HoodieTableMetaClient datasetMetaClient, String partition, String instantTime,
+      int shardCount) throws IOException {
+    ValidationUtils.checkArgument(shardCount <= 9999, "Maximum 9999 shards are supported.");

Review comment:
       add a constant for `9999`
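
       e.g. something along these lines (constant name is only a suggestion):

       ```java
       private static final int MAX_SHARD_COUNT = 9999;

       ValidationUtils.checkArgument(shardCount <= MAX_SHARD_COUNT,
           "Maximum " + MAX_SHARD_COUNT + " shards are supported.");
       ```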

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java
##########
@@ -399,14 +397,6 @@ protected void preWrite(String instantTime, WriteOperationType writeOperationTyp
       HoodieTableMetaClient metaClient) {
     setOperationType(writeOperationType);
     this.lastCompletedTxnAndMetadata = TransactionUtils.getLastCompletedTxnInstantAndMetadata(metaClient);
-    this.txnManager.beginTransaction(Option.of(new HoodieInstant(State.INFLIGHT, metaClient.getCommitActionType(), instantTime)), lastCompletedTxnAndMetadata

Review comment:
       nts: this lock was only being taken for purposes of syncing. So removing this is fine.

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -304,6 +315,7 @@ private boolean bootstrapFromFilesystem(HoodieEngineContext engineContext, Hoodi
       .initTable(hadoopConf.get(), metadataWriteConfig.getBasePath());
 
     initTableMetadata();
+    initializeShards(datasetMetaClient, MetadataPartitionType.FILES.partitionPath(), createInstantTime, 1);

Review comment:
       we should try to avoid introducing the new shard terminology. We should have it for bucketing, if that's what we intend it for. 

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -492,8 +527,26 @@ public void update(HoodieRestoreMetadata restoreMetadata, String instantTime) {
   @Override
   public void update(HoodieRollbackMetadata rollbackMetadata, String instantTime) {
     if (enabled) {
-      List<HoodieRecord> records = HoodieTableMetadataUtil.convertMetadataToRecords(rollbackMetadata, instantTime, metadata.getSyncedInstantTime());
-      commit(records, MetadataPartitionType.FILES.partitionPath(), instantTime);
+      this.txnManager.beginTransaction(Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, instantTime)),
+          Option.empty());
+      try {
+        // Is this rollback of an instant that has been synced to the metadata table?
+        String rollbackInstant = rollbackMetadata.getCommitsRollback().get(0);
+        boolean wasSynced = metaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, rollbackInstant));
+        if (!wasSynced) {
+          // A compaction may have taken place on metadata table which would have included this instant being rolled back.

Review comment:
       any scope for simplifying this?

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataWriter.java
##########
@@ -32,18 +36,36 @@
  */
 public interface HoodieTableMetadataWriter extends Serializable, AutoCloseable {
 
-  void update(HoodieCommitMetadata commitMetadata, String instantTime);
-
-  void update(HoodieCleanerPlan cleanerPlan, String instantTime);
+  // Update the metadata table due to a COMMIT operation
+  void update(HoodieCommitMetadata option, String instantTime);
 
+  // Update the metadata table due to a CLEAN operation
   void update(HoodieCleanMetadata cleanMetadata, String instantTime);
 
+  // Update the metadata table due to a RESTORE operation
   void update(HoodieRestoreMetadata restoreMetadata, String instantTime);
 
+  // Update the metadata table due to a ROLLBACK operation
   void update(HoodieRollbackMetadata rollbackMetadata, String instantTime);
 
   /**
    * Return the timestamp of the latest instant synced to the metadata table.
    */
   Option<String> getLatestSyncedInstantTime();
+
+  /**
+   * Remove the metadata table for the dataset.
+   *
+   * @param basePath base path of the dataset
+   * @param context
+   */
+  static void removeMetadataTable(String basePath, HoodieEngineContext context) {

Review comment:
       rename `deleteMetadataTable`

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -396,37 +408,56 @@ private boolean bootstrapFromFilesystem(HoodieEngineContext engineContext, Hoodi
   }
 
   /**
-   * Sync the Metadata Table from the instants created on the dataset.
+   * Initialize shards for a partition.
    *
-   * @param datasetMetaClient {@code HoodieTableMetaClient} for the dataset
+   * Each shard is a single log file with the following format:
+   *    <fileIdPrefix>ABCD
+   * where ABCD are digits. This allows up to 9999 shards.
+   *
+   * Example:
+   *    fc9f18eb-6049-4f47-bc51-23884bef0001
+   *    fc9f18eb-6049-4f47-bc51-23884bef0002
    */
-  private void syncFromInstants(HoodieTableMetaClient datasetMetaClient) {
-    ValidationUtils.checkState(enabled, "Metadata table cannot be synced as it is not enabled");
-    // (re) init the metadata for reading.
-    initTableMetadata();
-    try {
-      List<HoodieInstant> instantsToSync = metadata.findInstantsToSyncForWriter();
-      if (instantsToSync.isEmpty()) {
-        return;
-      }
-
-      LOG.info("Syncing " + instantsToSync.size() + " instants to metadata table: " + instantsToSync);
-
-      // Read each instant in order and sync it to metadata table
-      for (HoodieInstant instant : instantsToSync) {
-        LOG.info("Syncing instant " + instant + " to metadata table");
-
-        Option<List<HoodieRecord>> records = HoodieTableMetadataUtil.convertInstantToMetaRecords(datasetMetaClient, instant, getLatestSyncedInstantTime());
-        if (records.isPresent()) {
-          commit(records.get(), MetadataPartitionType.FILES.partitionPath(), instant.getTimestamp());
-        }
+  private void initializeShards(HoodieTableMetaClient datasetMetaClient, String partition, String instantTime,
+      int shardCount) throws IOException {
+    ValidationUtils.checkArgument(shardCount <= 9999, "Maximum 9999 shards are supported.");
+
+    final String newFileId = FSUtils.createNewFileIdPfx();
+    final String newFileIdPrefix = newFileId.substring(0, 32);
+    final HashMap<HeaderMetadataType, String> blockHeader = new HashMap<>();
+    blockHeader.put(HeaderMetadataType.INSTANT_TIME, instantTime);
+    final HoodieDeleteBlock block = new HoodieDeleteBlock(new HoodieKey[0], blockHeader);

Review comment:
       we are writing an empty delete block? why? just to create the file group/shard upfront?

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataWriter.java
##########
@@ -32,18 +36,36 @@
  */
 public interface HoodieTableMetadataWriter extends Serializable, AutoCloseable {
 
-  void update(HoodieCommitMetadata commitMetadata, String instantTime);
-
-  void update(HoodieCleanerPlan cleanerPlan, String instantTime);
+  // Update the metadata table due to a COMMIT operation
+  void update(HoodieCommitMetadata option, String instantTime);

Review comment:
       why `option`?

##########
File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java
##########
@@ -299,6 +302,7 @@ protected void completeCompaction(HoodieCommitMetadata metadata, JavaRDD<WriteSt
     this.context.setJobStatus(this.getClass().getSimpleName(), "Collect compaction write status and commit compaction");
     List<HoodieWriteStat> writeStats = writeStatuses.map(WriteStatus::getStat).collect();
     finalizeWrite(table, compactionCommitTime, writeStats);
+    table.getMetadataWriter().ifPresent(w -> w.update(metadata, compactionCommitTime));

Review comment:
       nts: audit all paths that commit to timeline and ensure this is done everywhere. this may also be an opportunity to streamline such code occurrences.

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java
##########
@@ -200,20 +200,19 @@ public boolean archiveIfRequired(HoodieEngineContext context) throws IOException
         .collect(Collectors.groupingBy(i -> Pair.of(i.getTimestamp(),
             HoodieInstant.getComparableAction(i.getAction()))));
 
-    // If metadata table is enabled, do not archive instants which are more recent that the latest synced
-    // instant on the metadata table. This is required for metadata table sync.
+    // If metadata table is enabled, do not archive instants which are more recent that the last compaction on the
+    // metadata table.
     if (config.useFileListingMetadata()) {
       try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(table.getContext(), config.getMetadataConfig(),
           config.getBasePath(), FileSystemViewStorageConfig.FILESYSTEM_VIEW_SPILLABLE_DIR.defaultValue())) {
-        Option<String> lastSyncedInstantTime = tableMetadata.getSyncedInstantTime();
-
-        if (lastSyncedInstantTime.isPresent()) {
-          LOG.info("Limiting archiving of instants to last synced instant on metadata table at " + lastSyncedInstantTime.get());
-          instants = instants.filter(i -> HoodieTimeline.compareTimestamps(i.getTimestamp(), HoodieTimeline.LESSER_THAN,
-              lastSyncedInstantTime.get()));
-        } else {
-          LOG.info("Not archiving as there is no instants yet on the metadata table");
+        Option<String> latestCompactionTime = tableMetadata.getLatestCompactionTime();

Review comment:
       Need to understand why it matters that the metadata table be compacted before a commit can be archived on the data timeline.

##########
File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java
##########
@@ -152,49 +134,57 @@ protected void commit(List<HoodieRecord> records, String partitionName, String i
   }
 
   /**
-   * Tag each record with the location.
+   *  Perform a compaction on the Metadata Table.
    *
-   * Since we only read the latest base file in a partition, we tag the records with the instant time of the latest
-   * base file.
+   * Cases to be handled:
+   *   1. We cannot perform compaction if there are previous inflight operations on the dataset. This is because
+   *      a compacted metadata base file at time Tx should represent all the actions on the dataset till time Tx.
+   *
+   *   2. In multi-writer scenario, a parallel operation with a greater instantTime may have completed creating a
+   *      deltacommit.
    */
-  private JavaRDD<HoodieRecord> prepRecords(List<HoodieRecord> records, String partitionName) {
-    HoodieTable table = HoodieSparkTable.create(metadataWriteConfig, engineContext);
-    TableFileSystemView.SliceView fsView = table.getSliceView();
-    List<HoodieBaseFile> baseFiles = fsView.getLatestFileSlices(partitionName)
-        .map(FileSlice::getBaseFile)
-        .filter(Option::isPresent)
-        .map(Option::get)
-        .collect(Collectors.toList());
-
-    // All the metadata fits within a single base file
-    if (partitionName.equals(MetadataPartitionType.FILES.partitionPath())) {
-      if (baseFiles.size() > 1) {
-        throw new HoodieMetadataException("Multiple base files found in metadata partition");
-      }
+  private void compactIfNecessary(SparkRDDWriteClient writeClient, String instantTime) {
+    String latestDeltacommitTime = metaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().lastInstant()
+        .get().getTimestamp();
+    List<HoodieInstant> pendingInstants = datasetMetaClient.reloadActiveTimeline().filterInflightsAndRequested()
+        .findInstantsBefore(latestDeltacommitTime).getInstants().collect(Collectors.toList());
+
+    if (!pendingInstants.isEmpty()) {
+      LOG.info(String.format("Cannot compact metadata table as there are %d inflight instants before latest deltacommit %s: %s",
+          pendingInstants.size(), latestDeltacommitTime, Arrays.toString(pendingInstants.toArray())));
+      return;
     }
 
-    JavaSparkContext jsc = ((HoodieSparkEngineContext) engineContext).getJavaSparkContext();
-    String fileId;
-    String instantTime;
-    if (!baseFiles.isEmpty()) {
-      fileId = baseFiles.get(0).getFileId();
-      instantTime = baseFiles.get(0).getCommitTime();
-    } else {
-      // If there is a log file then we can assume that it has the data
-      List<HoodieLogFile> logFiles = fsView.getLatestFileSlices(MetadataPartitionType.FILES.partitionPath())
-          .map(FileSlice::getLatestLogFile)
-          .filter(Option::isPresent)
-          .map(Option::get)
-          .collect(Collectors.toList());
-      if (logFiles.isEmpty()) {
-        // No base and log files. All are new inserts
-        return jsc.parallelize(records, 1);
-      }
-
-      fileId = logFiles.get(0).getFileId();
-      instantTime = logFiles.get(0).getBaseCommitTime();
+    // Trigger compaction with suffixes based on the same instant time. This ensures that any future
+    // delta commits synced over will not have an instant time lesser than the last completed instant on the
+    // metadata table.
+    final String compactionInstantTime = latestDeltacommitTime + "001";
+    if (writeClient.scheduleCompactionAtInstant(compactionInstantTime, Option.empty())) {
+      writeClient.compact(compactionInstantTime);
     }
+  }
+
+  private void cleanIfNecessary(SparkRDDWriteClient writeClient, String instantTime) {

Review comment:
       why the `IfNecessary` part?

##########
File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java
##########
@@ -96,6 +94,11 @@ public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig writeC
   public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig,
                              Option<EmbeddedTimelineService> timelineService) {
     super(context, writeConfig, timelineService);
+    if (config.useFileListingMetadata()) {

Review comment:
       should we use a flag for metadata table enable/disable, not just file listing?
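
       For instance, call sites could key off a general accessor instead of the file-listing flag (the accessor below is an assumption; it would be backed by the METADATA_ENABLE_PROP config shown later in this thread):

       ```java
       // Illustrative only: a general metadata-table switch rather than useFileListingMetadata().
       if (config.isMetadataTableEnabled()) {
         // ... same initialization as behind useFileListingMetadata() today ...
       }
       ```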

##########
File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java
##########
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.table.upgrade;
+
+import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.metadata.HoodieTableMetadataWriter;
+
+/**
+ * Upgrade handle to assist in upgrading hoodie table from version 1 to 2.
+ */
+public class OneToTwoUpgradeHandler implements UpgradeHandler {

Review comment:
       I think this will now be TwoToThree

##########
File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java
##########
@@ -101,7 +94,7 @@ protected void initialize(HoodieEngineContext engineContext, HoodieTableMetaClie
   @Override
   protected void commit(List<HoodieRecord> records, String partitionName, String instantTime) {

Review comment:
       nts: this cannot be a List<> for the record level index

##########
File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngrade.java
##########
@@ -48,7 +48,9 @@ public void run(HoodieTableMetaClient metaClient,
 
   @Override
   protected void upgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) {
-    if (fromVersion == HoodieTableVersion.ZERO && toVersion == HoodieTableVersion.ONE) {
+    if (fromVersion == HoodieTableVersion.ONE && toVersion == HoodieTableVersion.TWO) {

Review comment:
       there are some conflicts to be resolved here.

##########
File path: hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java
##########
@@ -40,17 +40,10 @@
   // Enable the internal Metadata Table which saves file listings
   public static final ConfigProperty<Boolean> METADATA_ENABLE_PROP = ConfigProperty
       .key(METADATA_PREFIX + ".enable")
-      .defaultValue(false)
+      .defaultValue(true)

Review comment:
       so all tests pass now?!!!

##########
File path: hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java
##########
@@ -229,116 +196,11 @@ protected BaseTableMetadata(HoodieEngineContext engineContext, HoodieMetadataCon
       statuses = hoodieRecord.get().getData().getFileStatuses(hadoopConf.get(), partitionPath);
     }
 
-    if (metadataConfig.validateFileListingMetadata()) {

Review comment:
       yay!

##########
File path: hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java
##########
@@ -331,4 +301,28 @@ public HoodieTableMetaClient getMetaClient() {
   public Map<String, String> stats() {
     return metrics.map(m -> m.getStats(true, metaClient, this)).orElse(new HashMap<>());
   }
+
+  @Override
+  public Option<String> getSyncedInstantTime() {

Review comment:
       these methods are the same! 

##########
File path: hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
##########
@@ -340,4 +297,77 @@ private static void processRollbackMetadata(HoodieRollbackMetadata rollbackMetad
 
     return records;
   }
+
+  /**
+   * Returns a list of commits which were rolled back as part of a Rollback or Restore operation.
+   *
+   * @param instant The Rollback operation to read
+   * @param timeline
+   */
+  public static List<String> getCommitsRolledback(HoodieInstant instant, HoodieActiveTimeline timeline) {

Review comment:
       should this belong someplace else?

##########
File path: hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
##########
@@ -223,11 +174,17 @@
     return convertFilesToRecords(partitionToDeletedFiles, partitionToAppendedFiles, instantTime, "Restore");
   }
 
-  public static List<HoodieRecord> convertMetadataToRecords(HoodieRollbackMetadata rollbackMetadata, String instantTime, Option<String> lastSyncTs) {
+  public static List<HoodieRecord> convertMetadataToRecords(HoodieRollbackMetadata rollbackMetadata, String instantTime,
+      Option<String> lastSyncTs, boolean wasSynced) {
 
     Map<String, Map<String, Long>> partitionToAppendedFiles = new HashMap<>();
     Map<String, List<String>> partitionToDeletedFiles = new HashMap<>();
     processRollbackMetadata(rollbackMetadata, partitionToDeletedFiles, partitionToAppendedFiles, lastSyncTs);
+    if (!wasSynced) {

Review comment:
       nts: need to revisit again with rollback/restore issues fixed

##########
File path: hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java
##########
@@ -162,157 +168,121 @@ private void initIfNeeded() {
       throw new HoodieIOException("Error merging records from metadata table for key :" + key, ioe);
     } finally {
       if (!reuse) {
-        closeOrThrow();
+        close(partitionName);
       }
     }
   }
 
-  private void openReadersIfNeededOrThrow() {
-    try {
-      openReadersIfNeeded();
-    } catch (IOException e) {
-      throw new HoodieIOException("Error opening readers to the Metadata Table: ", e);
-    }
-  }
-
   /**
    * Returns a new pair of readers to the base and log files.
    */
-  private void openReadersIfNeeded() throws IOException {
-    if (reuse && (baseFileReader != null || logRecordScanner != null)) {
-      // quickly exit out without synchronizing if reusing and readers are already open
-      return;
-    }
-
-    // we always force synchronization, if reuse=false, to handle concurrent close() calls as well.
-    synchronized (this) {
-      if (baseFileReader != null || logRecordScanner != null) {
-        return;
-      }
-
-      final long baseFileOpenMs;
-      final long logScannerOpenMs;
-
-      // Metadata is in sync till the latest completed instant on the dataset
-      HoodieTimer timer = new HoodieTimer().startTimer();
-      String latestInstantTime = getLatestDatasetInstantTime();
-      ValidationUtils.checkArgument(latestFileSystemMetadataSlices.size() == 1, "must be at-least one valid metadata file slice");
-
-      // If the base file is present then create a reader
-      Option<HoodieBaseFile> basefile = latestFileSystemMetadataSlices.get(0).getBaseFile();
-      if (basefile.isPresent()) {
-        String basefilePath = basefile.get().getPath();
-        baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath));
-        baseFileOpenMs = timer.endTimer();
-        LOG.info(String.format("Opened metadata base file from %s at instant %s in %d ms", basefilePath,
-            basefile.get().getCommitTime(), baseFileOpenMs));
-      } else {
-        baseFileOpenMs = 0;
-        timer.endTimer();
-      }
-
-      // Open the log record scanner using the log files from the latest file slice
-      timer.startTimer();
-      List<String> logFilePaths = latestFileSystemMetadataSlices.get(0).getLogFiles()
-          .sorted(HoodieLogFile.getLogFileComparator())
-          .map(o -> o.getPath().toString())
-          .collect(Collectors.toList());
-      Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline().filterCompletedInstants().lastInstant();
-      String latestMetaInstantTimestamp = lastInstant.map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP);
-
-      // Load the schema
-      Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema());
-      HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().fromProperties(metadataConfig.getProps()).build();
-      logRecordScanner = HoodieMetadataMergedLogRecordScanner.newBuilder()
-          .withFileSystem(metaClient.getFs())
-          .withBasePath(metadataBasePath)
-          .withLogFilePaths(logFilePaths)
-          .withReaderSchema(schema)
-          .withLatestInstantTime(latestMetaInstantTimestamp)
-          .withMaxMemorySizeInBytes(MAX_MEMORY_SIZE_IN_BYTES)
-          .withBufferSize(BUFFER_SIZE)
-          .withSpillableMapBasePath(spillableMapDirectory)
-          .withDiskMapType(commonConfig.getSpillableDiskMapType())
-          .withBitCaskDiskMapCompressionEnabled(commonConfig.isBitCaskDiskMapCompressionEnabled())
-          .build();
-
-      logScannerOpenMs = timer.endTimer();
-      LOG.info(String.format("Opened metadata log files from %s at instant (dataset instant=%s, metadata instant=%s) in %d ms",
-          logFilePaths, latestInstantTime, latestMetaInstantTimestamp, logScannerOpenMs));
-
-      metrics.ifPresent(metrics -> metrics.updateMetrics(HoodieMetadataMetrics.SCAN_STR, baseFileOpenMs + logScannerOpenMs));
-    }
-  }
+  private Pair<HoodieFileReader, HoodieMetadataMergedLogRecordScanner> openReadersIfNeeded(String key, String partitionName) throws IOException {

Review comment:
       nts: ensure all the reuse of these readers for timeline server etc is working really well

##########
File path: hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
##########
@@ -340,4 +297,77 @@ private static void processRollbackMetadata(HoodieRollbackMetadata rollbackMetad
 
     return records;
   }
+
+  /**
+   * Returns a list of commits which were rolled back as part of a Rollback or Restore operation.
+   *
+   * @param instant The Rollback operation to read
+   * @param timeline
+   */
+  public static List<String> getCommitsRolledback(HoodieInstant instant, HoodieActiveTimeline timeline) {
+    try {
+      if (instant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)) {
+        HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeHoodieRollbackMetadata(
+            timeline.getInstantDetails(instant).get());
+        return rollbackMetadata.getCommitsRollback();
+      }
+
+      List<String> commitsRolledback = new LinkedList<>();
+
+      if (instant.getAction().equals(HoodieTimeline.RESTORE_ACTION)) {
+        // Restore is made up of several rollbacks
+        HoodieRestoreMetadata restoreMetadata = TimelineMetadataUtils.deserializeHoodieRestoreMetadata(
+            timeline.getInstantDetails(instant).get());
+        restoreMetadata.getHoodieRestoreMetadata().values().forEach(rms -> {
+          rms.forEach(rm -> commitsRolledback.addAll(rm.getCommitsRollback()));
+        });
+      }
+
+      return commitsRolledback;
+    } catch (IOException e) {
+      throw new HoodieMetadataException("Error retrieving rollback commits for instant " + instant, e);
+    }
+  }
+
+  /**
+   * Map a key to a shard.
+   *
+   * Note: For hashing, the algorithm is same as String.hashCode() but is being defined here as hashCode()
+   * implementation is not guaranteed by the JVM to be consistent across JVM versions and implementations.
+   *
+   * @param str
+   * @return An integer hash of the given string
+   */
+  public static int keyToShard(String str, int numShards) {
+    int h = 0;
+    for (int i = 0; i < str.length(); ++i) {
+      h = 31 * h + str.charAt(i);
+    }
+
+    return Math.abs(Math.abs(h) % numShards);
+  }
+
+  /**
+   * Loads the list of shards for a partition of the Metadata Table.
+   *
+   * The list of shards is returned sorted in the correct order of shard index.
+   * @param metaClient
+   * @param partition The name of the partition whose shards are to be loaded.
+   * @return List of shards
+   */
+  public static List<FileSlice> loadPartitionShards(HoodieTableMetaClient metaClient, String partition) {

Review comment:
       let's streamline all this naming.
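
       For reference, a tiny usage sketch of the hashing above (the key value here is made up):

       ```java
       // keyToShard() always lands in [0, numShards), and is stable across JVMs because the
       // hash is spelled out rather than delegated to String.hashCode().
       List<FileSlice> shards = HoodieTableMetadataUtil.loadPartitionShards(
           metaClient, MetadataPartitionType.FILES.partitionPath());
       int shardIndex = HoodieTableMetadataUtil.keyToShard("2021/09/01", shards.size());
       FileSlice target = shards.get(shardIndex);
       ```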

##########
File path: hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java
##########
@@ -162,157 +168,121 @@ private void initIfNeeded() {
       throw new HoodieIOException("Error merging records from metadata table for key :" + key, ioe);
     } finally {
       if (!reuse) {
-        closeOrThrow();
+        close(partitionName);
       }
     }
   }
 
-  private void openReadersIfNeededOrThrow() {
-    try {
-      openReadersIfNeeded();
-    } catch (IOException e) {
-      throw new HoodieIOException("Error opening readers to the Metadata Table: ", e);
-    }
-  }
-
   /**
    * Returns a new pair of readers to the base and log files.
    */
-  private void openReadersIfNeeded() throws IOException {
-    if (reuse && (baseFileReader != null || logRecordScanner != null)) {
-      // quickly exit out without synchronizing if reusing and readers are already open
-      return;
-    }
-
-    // we always force synchronization, if reuse=false, to handle concurrent close() calls as well.
-    synchronized (this) {
-      if (baseFileReader != null || logRecordScanner != null) {
-        return;
-      }
-
-      final long baseFileOpenMs;
-      final long logScannerOpenMs;
-
-      // Metadata is in sync till the latest completed instant on the dataset
-      HoodieTimer timer = new HoodieTimer().startTimer();
-      String latestInstantTime = getLatestDatasetInstantTime();
-      ValidationUtils.checkArgument(latestFileSystemMetadataSlices.size() == 1, "must be at-least one valid metadata file slice");
-
-      // If the base file is present then create a reader
-      Option<HoodieBaseFile> basefile = latestFileSystemMetadataSlices.get(0).getBaseFile();
-      if (basefile.isPresent()) {
-        String basefilePath = basefile.get().getPath();
-        baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath));
-        baseFileOpenMs = timer.endTimer();
-        LOG.info(String.format("Opened metadata base file from %s at instant %s in %d ms", basefilePath,
-            basefile.get().getCommitTime(), baseFileOpenMs));
-      } else {
-        baseFileOpenMs = 0;
-        timer.endTimer();
-      }
-
-      // Open the log record scanner using the log files from the latest file slice
-      timer.startTimer();
-      List<String> logFilePaths = latestFileSystemMetadataSlices.get(0).getLogFiles()
-          .sorted(HoodieLogFile.getLogFileComparator())
-          .map(o -> o.getPath().toString())
-          .collect(Collectors.toList());
-      Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline().filterCompletedInstants().lastInstant();
-      String latestMetaInstantTimestamp = lastInstant.map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP);
-
-      // Load the schema
-      Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema());
-      HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().fromProperties(metadataConfig.getProps()).build();
-      logRecordScanner = HoodieMetadataMergedLogRecordScanner.newBuilder()
-          .withFileSystem(metaClient.getFs())
-          .withBasePath(metadataBasePath)
-          .withLogFilePaths(logFilePaths)
-          .withReaderSchema(schema)
-          .withLatestInstantTime(latestMetaInstantTimestamp)
-          .withMaxMemorySizeInBytes(MAX_MEMORY_SIZE_IN_BYTES)
-          .withBufferSize(BUFFER_SIZE)
-          .withSpillableMapBasePath(spillableMapDirectory)
-          .withDiskMapType(commonConfig.getSpillableDiskMapType())
-          .withBitCaskDiskMapCompressionEnabled(commonConfig.isBitCaskDiskMapCompressionEnabled())
-          .build();
-
-      logScannerOpenMs = timer.endTimer();
-      LOG.info(String.format("Opened metadata log files from %s at instant (dataset instant=%s, metadata instant=%s) in %d ms",
-          logFilePaths, latestInstantTime, latestMetaInstantTimestamp, logScannerOpenMs));
-
-      metrics.ifPresent(metrics -> metrics.updateMetrics(HoodieMetadataMetrics.SCAN_STR, baseFileOpenMs + logScannerOpenMs));
-    }
-  }
+  private Pair<HoodieFileReader, HoodieMetadataMergedLogRecordScanner> openReadersIfNeeded(String key, String partitionName) throws IOException {
+    return shardReaders.computeIfAbsent(partitionName, k -> {
+      try {
+        final long baseFileOpenMs;
+        final long logScannerOpenMs;
+        HoodieFileReader baseFileReader = null;
+        HoodieMetadataMergedLogRecordScanner logRecordScanner = null;
+
+        // Metadata is in sync till the latest completed instant on the dataset
+        HoodieTimer timer = new HoodieTimer().startTimer();
+        List<FileSlice> shards = HoodieTableMetadataUtil.loadPartitionShards(metaClient, partitionName);
+        ValidationUtils.checkArgument(shards.size() == 1, String.format("Invalid number of shards: found=%d, required=%d", shards.size(), 1));
+        final FileSlice slice = shards.get(HoodieTableMetadataUtil.keyToShard(key, shards.size()));
+
+        // If the base file is present then create a reader
+        Option<HoodieBaseFile> basefile = slice.getBaseFile();
+        if (basefile.isPresent()) {

Review comment:
       do we send initial data to log files without any base file? is this why we are creating the log files with an empty delete block upfront?

##########
File path: hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java
##########
@@ -141,13 +141,14 @@ protected void commit(Option<Map<String, String>> extraMetadata, HoodieWriteMeta
     result.setWriteStats(writeStats);
     // Finalize write
     finalizeWrite(instantTime, writeStats, result);
-    syncTableMetadata();
     try {
       LOG.info("Committing " + instantTime + ", action Type " + getCommitActionType());
       HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
       HoodieCommitMetadata metadata = CommitUtils.buildMetadata(writeStats, result.getPartitionToReplaceFileIds(),
           extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType());
 
+      syncTableMetadata(metadata);

Review comment:
       For Flink, this code is still executed at the driver, right?

##########
File path: hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngrade.java
##########
@@ -43,7 +43,9 @@ public void run(HoodieTableMetaClient metaClient, HoodieTableVersion toVersion,
 
   @Override
   protected void upgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) {
-    if (fromVersion == HoodieTableVersion.ZERO && toVersion == HoodieTableVersion.ONE) {
+    if (fromVersion == HoodieTableVersion.ONE && toVersion == HoodieTableVersion.TWO) {
+      // TODO:

Review comment:
       +1 

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -381,6 +387,57 @@ private boolean bootstrapFromFilesystem(HoodieEngineContext engineContext, Hoodi
     return partitionToFileStatus;
   }
 
+  /**
+   * Initialize shards for a partition.
+   *
+   * Each shard is a single log file with the following format:
+   *    <fileIdPrefix>ABCD
+   * where ABCD are digits. This allows up to 9999 shards.
+   *
+   * Example:
+   *    fc9f18eb-6049-4f47-bc51-23884bef0001
+   *    fc9f18eb-6049-4f47-bc51-23884bef0002

Review comment:
       if we use the 0000-9999 as a hash partition, then we cannot reuse that?

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseActionExecutor.java
##########
@@ -46,4 +50,24 @@ public BaseActionExecutor(HoodieEngineContext context, HoodieWriteConfig config,
   }
 
   public abstract R execute();
+
+  protected final void syncTableMetadata(HoodieCommitMetadata metadata) {

Review comment:
       +1 

##########
File path: hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java
##########
@@ -162,157 +168,121 @@ private void initIfNeeded() {
       throw new HoodieIOException("Error merging records from metadata table for key :" + key, ioe);
     } finally {
       if (!reuse) {
-        closeOrThrow();
+        close(partitionName);
       }
     }
   }
 
-  private void openReadersIfNeededOrThrow() {
-    try {
-      openReadersIfNeeded();
-    } catch (IOException e) {
-      throw new HoodieIOException("Error opening readers to the Metadata Table: ", e);
-    }
-  }
-
   /**
    * Returns a new pair of readers to the base and log files.
    */
-  private void openReadersIfNeeded() throws IOException {
-    if (reuse && (baseFileReader != null || logRecordScanner != null)) {
-      // quickly exit out without synchronizing if reusing and readers are already open
-      return;
-    }
-
-    // we always force synchronization, if reuse=false, to handle concurrent close() calls as well.
-    synchronized (this) {
-      if (baseFileReader != null || logRecordScanner != null) {
-        return;
-      }
-
-      final long baseFileOpenMs;
-      final long logScannerOpenMs;
-
-      // Metadata is in sync till the latest completed instant on the dataset
-      HoodieTimer timer = new HoodieTimer().startTimer();
-      String latestInstantTime = getLatestDatasetInstantTime();
-      ValidationUtils.checkArgument(latestFileSystemMetadataSlices.size() == 1, "must be at-least one valid metadata file slice");
-
-      // If the base file is present then create a reader
-      Option<HoodieBaseFile> basefile = latestFileSystemMetadataSlices.get(0).getBaseFile();
-      if (basefile.isPresent()) {
-        String basefilePath = basefile.get().getPath();
-        baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath));
-        baseFileOpenMs = timer.endTimer();
-        LOG.info(String.format("Opened metadata base file from %s at instant %s in %d ms", basefilePath,
-            basefile.get().getCommitTime(), baseFileOpenMs));
-      } else {
-        baseFileOpenMs = 0;
-        timer.endTimer();
-      }
-
-      // Open the log record scanner using the log files from the latest file slice
-      timer.startTimer();
-      List<String> logFilePaths = latestFileSystemMetadataSlices.get(0).getLogFiles()
-          .sorted(HoodieLogFile.getLogFileComparator())
-          .map(o -> o.getPath().toString())
-          .collect(Collectors.toList());
-      Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline().filterCompletedInstants().lastInstant();
-      String latestMetaInstantTimestamp = lastInstant.map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP);
-
-      // Load the schema
-      Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema());
-      HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().fromProperties(metadataConfig.getProps()).build();
-      logRecordScanner = HoodieMetadataMergedLogRecordScanner.newBuilder()
-          .withFileSystem(metaClient.getFs())
-          .withBasePath(metadataBasePath)
-          .withLogFilePaths(logFilePaths)
-          .withReaderSchema(schema)
-          .withLatestInstantTime(latestMetaInstantTimestamp)
-          .withMaxMemorySizeInBytes(MAX_MEMORY_SIZE_IN_BYTES)
-          .withBufferSize(BUFFER_SIZE)
-          .withSpillableMapBasePath(spillableMapDirectory)
-          .withDiskMapType(commonConfig.getSpillableDiskMapType())
-          .withBitCaskDiskMapCompressionEnabled(commonConfig.isBitCaskDiskMapCompressionEnabled())
-          .build();
-
-      logScannerOpenMs = timer.endTimer();
-      LOG.info(String.format("Opened metadata log files from %s at instant (dataset instant=%s, metadata instant=%s) in %d ms",
-          logFilePaths, latestInstantTime, latestMetaInstantTimestamp, logScannerOpenMs));
-
-      metrics.ifPresent(metrics -> metrics.updateMetrics(HoodieMetadataMetrics.SCAN_STR, baseFileOpenMs + logScannerOpenMs));
-    }
-  }
+  private Pair<HoodieFileReader, HoodieMetadataMergedLogRecordScanner> openReadersIfNeeded(String key, String partitionName) throws IOException {
+    return shardReaders.computeIfAbsent(partitionName, k -> {
+      try {
+        final long baseFileOpenMs;
+        final long logScannerOpenMs;
+        HoodieFileReader baseFileReader = null;
+        HoodieMetadataMergedLogRecordScanner logRecordScanner = null;
+
+        // Metadata is in sync till the latest completed instant on the dataset
+        HoodieTimer timer = new HoodieTimer().startTimer();
+        List<FileSlice> shards = HoodieTableMetadataUtil.loadPartitionShards(metaClient, partitionName);
+        ValidationUtils.checkArgument(shards.size() == 1, String.format("Invalid number of shards: found=%d, required=%d", shards.size(), 1));
+        final FileSlice slice = shards.get(HoodieTableMetadataUtil.keyToShard(key, shards.size()));
+
+        // If the base file is present then create a reader
+        Option<HoodieBaseFile> basefile = slice.getBaseFile();
+        if (basefile.isPresent()) {
+          String basefilePath = basefile.get().getPath();
+          baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath));
+          baseFileOpenMs = timer.endTimer();
+          LOG.info(String.format("Opened metadata base file from %s at instant %s in %d ms", basefilePath,
+              basefile.get().getCommitTime(), baseFileOpenMs));
+        } else {
+          baseFileOpenMs = 0;
+          timer.endTimer();
+        }
 
-  private void close(HoodieFileReader localFileReader, HoodieMetadataMergedLogRecordScanner localLogScanner) {
-    try {
-      if (localFileReader != null) {
-        localFileReader.close();
-      }
-      if (localLogScanner != null) {
-        localLogScanner.close();
+        // Open the log record scanner using the log files from the latest file slice
+        timer.startTimer();
+        List<String> logFilePaths = slice.getLogFiles()
+            .sorted(HoodieLogFile.getLogFileComparator())
+            .map(o -> o.getPath().toString())
+            .collect(Collectors.toList());
+
+        // Only those log files which have a corresponding completed instant on the dataset should be read
+        // This is because the metadata table is updated before the dataset instants are committed.
+        HoodieActiveTimeline datasetTimeline = datasetMetaClient.getActiveTimeline();
+        Set<String> validInstantTimestamps = datasetTimeline.filterCompletedInstants().getInstants()
+            .map(i -> i.getTimestamp()).collect(Collectors.toSet());
+
+        // For any rollbacks and restores, we cannot neglect the instants that they are rolling back.
+        // The rollback instant should be more recent than the start of the timeline for it to have rolled back any
+        // instant which we have a log block for.
+        final String minInstantTime = validInstantTimestamps.isEmpty() ? SOLO_COMMIT_TIMESTAMP : Collections.min(validInstantTimestamps);
+        datasetTimeline.getRollbackAndRestoreTimeline().filterCompletedInstants().getInstants()
+            .filter(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.GREATER_THAN, minInstantTime))
+            .forEach(instant -> {
+              validInstantTimestamps.addAll(HoodieTableMetadataUtil.getCommitsRolledback(instant, datasetTimeline));
+            });
+
+        // SOLO_COMMIT_TIMESTAMP is used during bootstrap so it is a valid timestamp
+        validInstantTimestamps.add(SOLO_COMMIT_TIMESTAMP);
+
+        Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline().filterCompletedInstants().lastInstant();
+        String latestMetaInstantTimestamp = lastInstant.map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP);
+
+        // Load the schema
+        Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema());
+        HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().fromProperties(metadataConfig.getProps()).build();
+        logRecordScanner = HoodieMetadataMergedLogRecordScanner.newBuilder()
+            .withFileSystem(metaClient.getFs())
+            .withBasePath(metadataBasePath)
+            .withLogFilePaths(logFilePaths)
+            .withReaderSchema(schema)
+            .withLatestInstantTime(latestMetaInstantTimestamp)
+            .withMaxMemorySizeInBytes(MAX_MEMORY_SIZE_IN_BYTES)
+            .withBufferSize(BUFFER_SIZE)
+            .withSpillableMapBasePath(spillableMapDirectory)
+            .withDiskMapType(commonConfig.getSpillableDiskMapType())
+            .withBitCaskDiskMapCompressionEnabled(commonConfig.isBitCaskDiskMapCompressionEnabled())
+            .withLogBlockTimestamps(validInstantTimestamps)

Review comment:
       this is what fences all uncommitted data from being read out of the metadata table
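
       For context, a minimal sketch of that fencing, using a placeholder class rather
       than the actual scanner API: the merged log record scanner is handed the set of
       instants that completed on the dataset timeline, and any log block stamped with
       an instant outside that set is skipped, so records written to the metadata table
       ahead of a dataset commit that later failed or was rolled back never become
       visible to readers.

       import java.util.Set;

       // Minimal sketch, not the Hudi scanner itself: only blocks whose instant time
       // is in the completed-instant set are merged; everything else is fenced out.
       class LogBlockFence {
         private final Set<String> validInstantTimestamps;

         LogBlockFence(Set<String> validInstantTimestamps) {
           this.validInstantTimestamps = validInstantTimestamps;
         }

         // A block written at blockInstantTime is readable only if that instant
         // completed on the dataset (the PR also adds SOLO_COMMIT_TIMESTAMP to the
         // set so bootstrap records stay readable).
         boolean isReadable(String blockInstantTime) {
           return validInstantTimestamps.contains(blockInstantTime);
         }
       }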

##########
File path: hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java
##########
@@ -162,157 +168,121 @@ private void initIfNeeded() {
       throw new HoodieIOException("Error merging records from metadata table for key :" + key, ioe);
     } finally {
       if (!reuse) {
-        closeOrThrow();
+        close(partitionName);
       }
     }
   }
 
-  private void openReadersIfNeededOrThrow() {
-    try {
-      openReadersIfNeeded();
-    } catch (IOException e) {
-      throw new HoodieIOException("Error opening readers to the Metadata Table: ", e);
-    }
-  }
-
   /**
    * Returns a new pair of readers to the base and log files.
    */
-  private void openReadersIfNeeded() throws IOException {
-    if (reuse && (baseFileReader != null || logRecordScanner != null)) {
-      // quickly exit out without synchronizing if reusing and readers are already open
-      return;
-    }
-
-    // we always force synchronization, if reuse=false, to handle concurrent close() calls as well.
-    synchronized (this) {
-      if (baseFileReader != null || logRecordScanner != null) {
-        return;
-      }
-
-      final long baseFileOpenMs;
-      final long logScannerOpenMs;
-
-      // Metadata is in sync till the latest completed instant on the dataset
-      HoodieTimer timer = new HoodieTimer().startTimer();
-      String latestInstantTime = getLatestDatasetInstantTime();
-      ValidationUtils.checkArgument(latestFileSystemMetadataSlices.size() == 1, "must be at-least one valid metadata file slice");
-
-      // If the base file is present then create a reader
-      Option<HoodieBaseFile> basefile = latestFileSystemMetadataSlices.get(0).getBaseFile();
-      if (basefile.isPresent()) {
-        String basefilePath = basefile.get().getPath();
-        baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath));
-        baseFileOpenMs = timer.endTimer();
-        LOG.info(String.format("Opened metadata base file from %s at instant %s in %d ms", basefilePath,
-            basefile.get().getCommitTime(), baseFileOpenMs));
-      } else {
-        baseFileOpenMs = 0;
-        timer.endTimer();
-      }
-
-      // Open the log record scanner using the log files from the latest file slice
-      timer.startTimer();
-      List<String> logFilePaths = latestFileSystemMetadataSlices.get(0).getLogFiles()
-          .sorted(HoodieLogFile.getLogFileComparator())
-          .map(o -> o.getPath().toString())
-          .collect(Collectors.toList());
-      Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline().filterCompletedInstants().lastInstant();
-      String latestMetaInstantTimestamp = lastInstant.map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP);
-
-      // Load the schema
-      Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema());
-      HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().fromProperties(metadataConfig.getProps()).build();
-      logRecordScanner = HoodieMetadataMergedLogRecordScanner.newBuilder()
-          .withFileSystem(metaClient.getFs())
-          .withBasePath(metadataBasePath)
-          .withLogFilePaths(logFilePaths)
-          .withReaderSchema(schema)
-          .withLatestInstantTime(latestMetaInstantTimestamp)
-          .withMaxMemorySizeInBytes(MAX_MEMORY_SIZE_IN_BYTES)
-          .withBufferSize(BUFFER_SIZE)
-          .withSpillableMapBasePath(spillableMapDirectory)
-          .withDiskMapType(commonConfig.getSpillableDiskMapType())
-          .withBitCaskDiskMapCompressionEnabled(commonConfig.isBitCaskDiskMapCompressionEnabled())
-          .build();
-
-      logScannerOpenMs = timer.endTimer();
-      LOG.info(String.format("Opened metadata log files from %s at instant (dataset instant=%s, metadata instant=%s) in %d ms",
-          logFilePaths, latestInstantTime, latestMetaInstantTimestamp, logScannerOpenMs));
-
-      metrics.ifPresent(metrics -> metrics.updateMetrics(HoodieMetadataMetrics.SCAN_STR, baseFileOpenMs + logScannerOpenMs));
-    }
-  }
+  private Pair<HoodieFileReader, HoodieMetadataMergedLogRecordScanner> openReadersIfNeeded(String key, String partitionName) throws IOException {

Review comment:
       this method needs to be broken down
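
       One possible decomposition, sketched with placeholder types so the snippet
       stands alone (the real refactor would keep FileSlice, HoodieFileReader and
       HoodieMetadataMergedLogRecordScanner): pull the base-file-reader and
       log-record-scanner construction into their own helpers and leave
       openReadersIfNeeded as a thin orchestrator. Names and types below are assumed,
       not taken from the PR.

       // Hedged sketch of the suggested split; placeholder types and assumed names.
       class MetadataReaderOpener {
         static class BaseReader {}
         static class LogScanner {}
         static class Slice {}

         // Thin orchestrator: resolve the slice elsewhere, then delegate.
         ReaderPair openReaders(Slice slice) {
           return new ReaderPair(openBaseFileReader(slice), openLogRecordScanner(slice));
         }

         private BaseReader openBaseFileReader(Slice slice) {
           // open (and time) the base file reader if the slice has a base file
           return new BaseReader();
         }

         private LogScanner openLogRecordScanner(Slice slice) {
           // collect sorted log file paths, compute valid instant timestamps, build the scanner
           return new LogScanner();
         }

         // Tiny immutable pair so the sketch has no external dependencies.
         static final class ReaderPair {
           final BaseReader base;
           final LogScanner scanner;
           ReaderPair(BaseReader base, LogScanner scanner) { this.base = base; this.scanner = scanner; }
         }
       }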

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -110,23 +119,31 @@ protected HoodieBackedTableMetadataWriter(Configuration hadoopConf, HoodieWriteC
       ValidationUtils.checkArgument(!this.metadataWriteConfig.useFileListingMetadata(), "File listing cannot be used for Metadata Table");
 
       initRegistry();
-      HoodieTableMetaClient datasetMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(datasetWriteConfig.getBasePath()).build();
-      initialize(engineContext, datasetMetaClient);
-      if (enabled) {
-        // This is always called even in case the table was created for the first time. This is because
-        // initFromFilesystem() does file listing and hence may take a long time during which some new updates
-        // may have occurred on the table. Hence, calling this always ensures that the metadata is brought in sync
-        // with the active timeline.
-        HoodieTimer timer = new HoodieTimer().startTimer();
-        syncFromInstants(datasetMetaClient);
-        metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.SYNC_STR, timer.endTimer()));
-      }
+      this.datasetMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(datasetWriteConfig.getBasePath()).build();
+      initTransactionManager();
+      initialize(engineContext);
+      initTableMetadata();
     } else {
       enabled = false;
       this.metrics = Option.empty();
     }
   }
 
+  /**
+   * Initialize the {@code TransactionManager} to use for metadata table.
+   *
+   * In HUDI multi writer mode, each operation will sync to metadata table before completion. Metadata table has common
+   * base and log files to update for each operation. So we can only support serialized operations.
+   */
+  private void initTransactionManager() {
+    // The lock location should be different from the dataset
+    Properties properties = new Properties();
+    properties.putAll(datasetWriteConfig.getProps());
+    properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, properties.getProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, datasetWriteConfig.getBasePath() + "/.hoodie/.locks") + "/metadata");

Review comment:
       +1 

##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -110,23 +119,31 @@ protected HoodieBackedTableMetadataWriter(Configuration hadoopConf, HoodieWriteC
       ValidationUtils.checkArgument(!this.metadataWriteConfig.useFileListingMetadata(), "File listing cannot be used for Metadata Table");
 
       initRegistry();
-      HoodieTableMetaClient datasetMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(datasetWriteConfig.getBasePath()).build();
-      initialize(engineContext, datasetMetaClient);
-      if (enabled) {
-        // This is always called even in case the table was created for the first time. This is because
-        // initFromFilesystem() does file listing and hence may take a long time during which some new updates
-        // may have occurred on the table. Hence, calling this always ensures that the metadata is brought in sync
-        // with the active timeline.
-        HoodieTimer timer = new HoodieTimer().startTimer();
-        syncFromInstants(datasetMetaClient);
-        metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.SYNC_STR, timer.endTimer()));
-      }
+      this.datasetMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(datasetWriteConfig.getBasePath()).build();
+      initTransactionManager();
+      initialize(engineContext);
+      initTableMetadata();
     } else {
       enabled = false;
       this.metrics = Option.empty();
     }
   }
 
+  /**
+   * Initialize the {@code TransactionManager} to use for metadata table.
+   *
+   * In HUDI multi writer mode, each operation will sync to metadata table before completion. Metadata table has common
+   * base and log files to update for each operation. So we can only support serialized operations.
+   */
+  private void initTransactionManager() {
+    // The lock location should be different from the dataset
+    Properties properties = new Properties();
+    properties.putAll(datasetWriteConfig.getProps());
+    properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, properties.getProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, datasetWriteConfig.getBasePath() + "/.hoodie/.locks") + "/metadata");

Review comment:
       Filesystem-based lock may not work on cloud storage. Not sure if we can assume this.
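
       If the filesystem lock cannot be assumed safe on a given store, the
       metadata-table transaction manager could instead be pointed at a lock provider
       that does not rely on atomic file creation. Below is a hedged sketch using the
       ZooKeeper-based provider; the property keys and class name are assumptions based
       on the lock providers shipped with Hudi around this time and should be verified
       against the release in use.

       import java.util.Properties;

       // Hedged sketch: route metadata-table locking through ZooKeeper rather than the
       // filesystem. Verify the exact property names per Hudi version.
       class MetadataLockConfigSketch {
         static Properties zkLockProps() {
           Properties props = new Properties();
           props.setProperty("hoodie.write.lock.provider",
               "org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider");
           props.setProperty("hoodie.write.lock.zookeeper.url", "zk-host");   // assumption: your ZK quorum
           props.setProperty("hoodie.write.lock.zookeeper.port", "2181");
           props.setProperty("hoodie.write.lock.zookeeper.base_path", "/hudi/locks");
           // Keep metadata-table locks distinct from dataset locks, mirroring the
           // "/metadata" suffix the PR appends to the filesystem lock path.
           props.setProperty("hoodie.write.lock.zookeeper.lock_key", "metadata_table");
           return props;
         }
       }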



