Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/04/18 14:41:18 UTC

[GitHub] [hudi] nsivabalan commented on a diff in pull request #5342: [HUDI-3899] Drop index to delete pending index instants from timeline

nsivabalan commented on code in PR #5342:
URL: https://github.com/apache/hudi/pull/5342#discussion_r852164219


##########
hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java:
##########
@@ -132,29 +139,84 @@ public void testIndexerWithNotAllIndexesEnabled() {
     assertNoWriteErrors(statuses);
 
     // validate table config
-    assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
-    assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
+    assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
+    assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
 
     // build indexer config which has only column_stats enabled (files is enabled by default)
     HoodieIndexer.Config config = new HoodieIndexer.Config();
     String propsPath = Objects.requireNonNull(getClass().getClassLoader().getResource("delta-streamer-config/indexer.properties")).getPath();
     config.basePath = basePath;
     config.tableName = tableName;
     config.indexTypes = "COLUMN_STATS";
-    config.runningMode = "scheduleAndExecute";
+    config.runningMode = SCHEDULE_AND_EXECUTE;
     config.propsFilePath = propsPath;
     // start the indexer and validate column_stats index is also complete
     HoodieIndexer indexer = new HoodieIndexer(jsc, config);
     assertEquals(0, indexer.start(0));
 
     // validate table config
-    assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
-    assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
-    assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(COLUMN_STATS.getPartitionPath()));
+    assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
+    assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
+    assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(COLUMN_STATS.getPartitionPath()));
     // validate metadata partitions actually exist
-    assertTrue(HoodieTableMetadataUtil.metadataPartitionExists(basePath, context, FILES));
-    assertTrue(HoodieTableMetadataUtil.metadataPartitionExists(basePath, context, COLUMN_STATS));
-    assertTrue(HoodieTableMetadataUtil.metadataPartitionExists(basePath, context, BLOOM_FILTERS));
+    assertTrue(metadataPartitionExists(basePath, context, FILES));
+    assertTrue(metadataPartitionExists(basePath, context, COLUMN_STATS));
+    assertTrue(metadataPartitionExists(basePath, context, BLOOM_FILTERS));
+  }
+
+  @Test
+  public void testIndexerDropPartitionDeletesInstantFromTimeline() throws Exception {
+    initTestDataGenerator();
+    String tableName = "indexer_test";
+    HoodieWriteConfig.Builder writeConfigBuilder = getWriteConfigBuilder(basePath, tableName);
+    // enable files and bloom_filters index on the regular write client
+    HoodieMetadataConfig.Builder metadataConfigBuilder = getMetadataConfigBuilder(true, false).withMetadataIndexBloomFilter(true);
+    HoodieWriteConfig writeConfig = writeConfigBuilder.withMetadataConfig(metadataConfigBuilder.build()).build();
+    // do one upsert with synchronous metadata update
+    SparkRDDWriteClient writeClient = new SparkRDDWriteClient(context, writeConfig);
+    String instant = "0001";
+    writeClient.startCommitWithTime(instant);
+    List<HoodieRecord> records = dataGen.generateInserts(instant, 100);
+    JavaRDD<WriteStatus> result = writeClient.upsert(jsc.parallelize(records, 1), instant);
+    List<WriteStatus> statuses = result.collect();
+    assertNoWriteErrors(statuses);
+
+    // validate partitions built successfully
+    assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
+    assertTrue(metadataPartitionExists(basePath, context, FILES));
+    assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
+    assertTrue(metadataPartitionExists(basePath, context, BLOOM_FILTERS));
+
+    // build indexer config which has only column_stats enabled (files is enabled by default)
+    HoodieIndexer.Config config = new HoodieIndexer.Config();
+    String propsPath = Objects.requireNonNull(getClass().getClassLoader().getResource("delta-streamer-config/indexer.properties")).getPath();
+    config.basePath = basePath;
+    config.tableName = tableName;
+    config.indexTypes = "COLUMN_STATS";
+    config.runningMode = SCHEDULE;
+    config.propsFilePath = propsPath;
+
+    // schedule indexing and validate column_stats index is also initialized
+    HoodieIndexer indexer = new HoodieIndexer(jsc, config);
+    assertEquals(0, indexer.start(0));
+    Option<HoodieInstant> indexInstantInTimeline = metaClient.reloadActiveTimeline().filterPendingIndexTimeline().lastInstant();
+    assertTrue(indexInstantInTimeline.isPresent());
+    assertEquals(REQUESTED, indexInstantInTimeline.get().getState());
+    assertTrue(metadataPartitionExists(basePath, context, COLUMN_STATS));
+
+    // drop column_stats and validate indexing.requested is also removed from the timeline
+    config.runningMode = DROP_INDEX;
+    indexer = new HoodieIndexer(jsc, config);
+    assertEquals(0, indexer.start(0));
+    indexInstantInTimeline = metaClient.reloadActiveTimeline().filterPendingIndexTimeline().lastInstant();
+    assertFalse(indexInstantInTimeline.isPresent());
+    assertFalse(metadataPartitionExists(basePath, context, COLUMN_STATS));
+
+    // check other partitions are intact
+    assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));

Review Comment:
   If we have any other tests for DropPartitions, can we add an assertion that, for a fully built-out MDT partition, the timeline files are not removed? i.e. if the index instants are completed, dropPartition should not touch the timeline files.
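   A rough sketch (not from this PR) of what such an assertion could look like, reusing this test's `config`/`metaClient` setup; the local name `completedInstantsBeforeDrop` is just illustrative:
   ```java
   // sketch only: assumes COLUMN_STATS was already built to completion
   // (i.e. its indexing instant reached a completed state) before the drop
   int completedInstantsBeforeDrop =
       metaClient.reloadActiveTimeline().filterCompletedInstants().countInstants();

   config.runningMode = DROP_INDEX;
   assertEquals(0, new HoodieIndexer(jsc, config).start(0));

   // the metadata partition is gone, but no completed timeline files were removed
   assertFalse(metadataPartitionExists(basePath, context, COLUMN_STATS));
   assertEquals(completedInstantsBeforeDrop,
       metaClient.reloadActiveTimeline().filterCompletedInstants().countInstants());
   ```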
   


