You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2021/10/20 14:23:44 UTC

[GitHub] [hudi] nsivabalan commented on a change in pull request #3825: [HUDI-2472] Cleaner keep latest version policy test with metadata table enabled

nsivabalan commented on a change in pull request #3825:
URL: https://github.com/apache/hudi/pull/3825#discussion_r732809214



##########
File path: hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/testutils/HoodieMetadataTestTable.java
##########
@@ -56,11 +60,52 @@ public static HoodieTestTable of(HoodieTableMetaClient metaClient, HoodieTableMe
     return new HoodieMetadataTestTable(metaClient.getBasePath(), metaClient.getRawFs(), metaClient, writer);
   }
 
+  /**
+   * Add commits to the requested partitions and update metadata table.
+   *
+   * @param commitTime           - Commit time for the operation
+   * @param operationType        - Operation type
+   * @param newPartitionsToAdd   - New partitions to add for the operation
+   * @param partitions           - List of partitions for this operation
+   * @param filesPerPartition    - Total file count to create per partition
+   * @param bootstrap            - Whether bootstrapping needed for the operation
+   * @param createInflightCommit - Whether in flight commit needed for the operation
+   * @return Commit metadata for the commit operation performed.
+   * @throws Exception
+   */
   @Override
   public HoodieCommitMetadata doWriteOperation(String commitTime, WriteOperationType operationType,
                                                List<String> newPartitionsToAdd, List<String> partitions,
-                                               int filesPerPartition, boolean bootstrap, boolean createInflightCommit) throws Exception {
-    HoodieCommitMetadata commitMetadata = super.doWriteOperation(commitTime, operationType, newPartitionsToAdd, partitions, filesPerPartition, bootstrap, createInflightCommit);
+                                               int filesPerPartition, boolean bootstrap,
+                                               boolean createInflightCommit) throws Exception {
+    if (partitions.isEmpty()) {
+      partitions = Collections.singletonList(EMPTY_STRING);
+    }
+    Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = getPartitionFiles(partitions,
+        filesPerPartition);
+    return this.doWriteOperation(commitTime, operationType, newPartitionsToAdd, partitionToFilesNameLengthMap,

Review comment:
       shouldn't we move this to HoodieTestTable. 
   We would like to keep only metadata writer related code here. 
   something like
   ```
   super.method()
   call writer if need be
   ```
   if writer is not involved, probably testTable is the right place to hold such methods.
   

##########
File path: hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java
##########
@@ -635,47 +638,61 @@ private void testFailedInsertAndCleanByCommits(
   }
 
   /**
-   * Test HoodieTable.clean() Cleaning by versions logic.
+   * Test Hudi COW Table Cleaner - Keep the latest file versions policy.
    */
   @ParameterizedTest
   @ValueSource(booleans = {false, true})
   public void testKeepLatestFileVersions(Boolean enableBootstrapSourceClean) throws Exception {
     HoodieWriteConfig config =
         HoodieWriteConfig.newBuilder().withPath(basePath)
-            .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(false).build())
+            .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(true).build())
             .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                 .withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean)
                 .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build())
             .build();
-    HoodieTestTable testTable = HoodieTestTable.of(metaClient);
-    String p0 = "2020/01/01";
-    String p1 = "2020/01/02";
-    Map<String, List<BootstrapFileMapping>> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null;
+
+    HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context);
+    HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter);
+
+    final String p0 = "2020/01/01";
+    final String p1 = "2020/01/02";
+    final Map<String, List<BootstrapFileMapping>> bootstrapMapping = enableBootstrapSourceClean
+        ? generateBootstrapIndexAndSourceData(p0, p1) : null;
 
     // make 1 commit, with 1 file per partition
-    String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId()
+    final String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId()
         : UUID.randomUUID().toString();
-    String file1P1C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p1).get(0).getFileId()
+    final String file1P1C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p1).get(0).getFileId()
         : UUID.randomUUID().toString();
-    testTable.addCommit("00000000000001").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0);
+
+    Map<String, List<Pair<String, Integer>>> c1PartitionToFilesNameLengthMap = new HashMap<>();
+    c1PartitionToFilesNameLengthMap.put(p0, Collections.singletonList(Pair.of(file1P0C0, 100)));
+    c1PartitionToFilesNameLengthMap.put(p1, Collections.singletonList(Pair.of(file1P1C0, 200)));
+    testTable.doWriteOperation("00000000000001", WriteOperationType.INSERT, Arrays.asList(p0, p1),
+        c1PartitionToFilesNameLengthMap, false, false);
 
     List<HoodieCleanStat> hoodieCleanStatsOne = runCleaner(config);
     assertEquals(0, hoodieCleanStatsOne.size(), "Must not clean any files");
     assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
     assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
 
     // make next commit, with 1 insert & 1 update per partition
-    Map<String, String> partitionAndFileId002 = testTable.addCommit("00000000000002")
-        .withBaseFilesInPartition(p0, file1P0C0)
-        .withBaseFilesInPartition(p1, file1P1C0)
-        .getFileIdsWithBaseFilesInPartitions(p0, p1);
+    final String file2P0C1 = UUID.randomUUID().toString();
+    final String file2P1C1 = UUID.randomUUID().toString();
+    Map<String, List<Pair<String, Integer>>> c2PartitionToFilesNameLengthMap = new HashMap<>();
+    c2PartitionToFilesNameLengthMap.put(p0, Arrays.asList(Pair.of(file1P0C0, 101), Pair.of(file2P0C1, 100)));
+    c2PartitionToFilesNameLengthMap.put(p1, Arrays.asList(Pair.of(file1P1C0, 201), Pair.of(file2P1C1, 200)));
+    testTable.doWriteOperation("00000000000002", WriteOperationType.UPSERT, Collections.emptyList(),
+        c2PartitionToFilesNameLengthMap, false, false);
 
     List<HoodieCleanStat> hoodieCleanStatsTwo = runCleaner(config, 1);
-    // enableBootstrapSourceClean would delete the bootstrap base file as the same time
     HoodieCleanStat cleanStat = getCleanStat(hoodieCleanStatsTwo, p0);
+
+    // enableBootstrapSourceClean would delete the bootstrap base file at the same time

Review comment:
       guess this comment should go just before line 689




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org