Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2020/10/03 14:58:12 UTC

[GitHub] [hudi] xushiyan commented on a change in pull request #2127: [HUDI-284] add more test for UpdateSchemaEvolution

xushiyan commented on a change in pull request #2127:
URL: https://github.com/apache/hudi/pull/2127#discussion_r499154318



##########
File path: hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestUpdateSchemaEvolution.java
##########
@@ -68,61 +71,62 @@ public void tearDown() throws IOException {
     cleanupResources();
   }
 
-  @Test
-  public void testSchemaEvolutionOnUpdate() throws Exception {
+  private WriteStatus prepareFirstCommitData(List<String> recordsStrs) throws IOException {
     // Create a bunch of records with a old version of schema
     final HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt");
     final HoodieSparkTable table = HoodieSparkTable.create(config, context);
-
     final List<WriteStatus> statuses = jsc.parallelize(Arrays.asList(1)).map(x -> {
-      String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
-          + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
-      String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
-          + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
-      String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
-          + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
       List<HoodieRecord> insertRecords = new ArrayList<>();
-      RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
-      insertRecords
-          .add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
-      RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
-      insertRecords
-          .add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
-      RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
-      insertRecords
-          .add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
-
+      for (String recordStr : recordsStrs) {
+        RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
+        insertRecords
+            .add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
+      }
       Map<String, HoodieRecord> insertRecordMap = insertRecords.stream()
           .collect(Collectors.toMap(r -> r.getRecordKey(), Function.identity()));
       HoodieCreateHandle createHandle =
-          new HoodieCreateHandle(config, "100", table, rowChange1.getPartitionPath(), "f1-0", insertRecordMap, supplier);
+          new HoodieCreateHandle(config, "100", table, insertRecords.get(0).getPartitionPath(), "f1-0", insertRecordMap, supplier);
       createHandle.write();
       return createHandle.close();
     }).collect();
 
     final Path commitFile = new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
     FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile);
+    return statuses.get(0);
+  }
+
+  @Test
+  public void testSchemaEvolutionOnUpdateSuccessWithAddColumnHaveDefault() throws Exception {
+    List<String> recordsStrs = new ArrayList<>();
+    String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
+    String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
+    String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
+    recordsStrs.add(recordStr1);
+    recordsStrs.add(recordStr2);
+    recordsStrs.add(recordStr3);

Review comment:
       These 3 local vars could be removed.
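       E.g., the payload strings could be added to the list directly (just a sketch, reusing the same JSON payloads from the diff above):

```java
List<String> recordsStrs = new ArrayList<>();
// add the payloads inline instead of naming recordStr1/2/3
recordsStrs.add("{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
    + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
recordsStrs.add("{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
    + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}");
recordsStrs.add("{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
    + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}");
```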

##########
File path: hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestUpdateSchemaEvolution.java
##########
@@ -68,61 +71,62 @@ public void tearDown() throws IOException {
     cleanupResources();
   }
 
-  @Test
-  public void testSchemaEvolutionOnUpdate() throws Exception {
+  private WriteStatus prepareFirstCommitData(List<String> recordsStrs) throws IOException {
     // Create a bunch of records with a old version of schema
     final HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt");
     final HoodieSparkTable table = HoodieSparkTable.create(config, context);
-
     final List<WriteStatus> statuses = jsc.parallelize(Arrays.asList(1)).map(x -> {
-      String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
-          + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
-      String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
-          + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
-      String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
-          + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
       List<HoodieRecord> insertRecords = new ArrayList<>();
-      RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
-      insertRecords
-          .add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
-      RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
-      insertRecords
-          .add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
-      RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
-      insertRecords
-          .add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
-
+      for (String recordStr : recordsStrs) {
+        RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
+        insertRecords
+            .add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
+      }
       Map<String, HoodieRecord> insertRecordMap = insertRecords.stream()
           .collect(Collectors.toMap(r -> r.getRecordKey(), Function.identity()));
       HoodieCreateHandle createHandle =
-          new HoodieCreateHandle(config, "100", table, rowChange1.getPartitionPath(), "f1-0", insertRecordMap, supplier);
+          new HoodieCreateHandle(config, "100", table, insertRecords.get(0).getPartitionPath(), "f1-0", insertRecordMap, supplier);
       createHandle.write();
       return createHandle.close();
     }).collect();
 
     final Path commitFile = new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
     FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile);
+    return statuses.get(0);
+  }
+
+  @Test
+  public void testSchemaEvolutionOnUpdateSuccessWithAddColumnHaveDefault() throws Exception {
+    List<String> recordsStrs = new ArrayList<>();
+    String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
+    String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
+    String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
+    recordsStrs.add(recordStr1);
+    recordsStrs.add(recordStr2);
+    recordsStrs.add(recordStr3);
 
     // Now try an update with an evolved schema
     // Evolved schema does not have guarantee on preserving the original field ordering
     final HoodieWriteConfig config2 = makeHoodieClientConfig("/exampleEvolvedSchema.txt");
-    final WriteStatus insertResult = statuses.get(0);
+    final WriteStatus insertResult = prepareFirstCommitData(recordsStrs);
     String fileId = insertResult.getFileId();
 
-    final HoodieSparkTable table2 = HoodieSparkTable.create(config, context);
+    final HoodieSparkTable table2 = HoodieSparkTable.create(config2, context);

Review comment:
       Could we call them `table` and `config` instead of `table2` and `config2`? Since the method has been split out, they are the only such vars left here.
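       Something like this (a sketch of the rename only, logic unchanged):

```java
// evolved-schema config/table are the only ones left in this test method now
final HoodieWriteConfig config = makeHoodieClientConfig("/exampleEvolvedSchema.txt");
final WriteStatus insertResult = prepareFirstCommitData(recordsStrs);
String fileId = insertResult.getFileId();
final HoodieSparkTable table = HoodieSparkTable.create(config, context);
```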

##########
File path: hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestUpdateSchemaEvolution.java
##########
@@ -68,61 +71,62 @@ public void tearDown() throws IOException {
     cleanupResources();
   }
 
-  @Test
-  public void testSchemaEvolutionOnUpdate() throws Exception {
+  private WriteStatus prepareFirstCommitData(List<String> recordsStrs) throws IOException {

Review comment:
       To avoid ambiguity, could we call this `prepareFirstRecordCommit()`?
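       i.e. something like (rename only, body unchanged from the diff above):

```java
private WriteStatus prepareFirstRecordCommit(List<String> recordsStrs) throws IOException {
  // ... same body as prepareFirstCommitData: write the records with the old schema
  // and return the first WriteStatus ...
}

// call site in the test:
final WriteStatus insertResult = prepareFirstRecordCommit(recordsStrs);
```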

##########
File path: hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestUpdateSchemaEvolution.java
##########
@@ -68,61 +71,62 @@ public void tearDown() throws IOException {
     cleanupResources();
   }
 
-  @Test
-  public void testSchemaEvolutionOnUpdate() throws Exception {
+  private WriteStatus prepareFirstCommitData(List<String> recordsStrs) throws IOException {
     // Create a bunch of records with a old version of schema
     final HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt");
     final HoodieSparkTable table = HoodieSparkTable.create(config, context);
-
     final List<WriteStatus> statuses = jsc.parallelize(Arrays.asList(1)).map(x -> {
-      String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
-          + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
-      String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
-          + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
-      String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
-          + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
       List<HoodieRecord> insertRecords = new ArrayList<>();
-      RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
-      insertRecords
-          .add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
-      RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
-      insertRecords
-          .add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
-      RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
-      insertRecords
-          .add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
-
+      for (String recordStr : recordsStrs) {
+        RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
+        insertRecords
+            .add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
+      }
       Map<String, HoodieRecord> insertRecordMap = insertRecords.stream()
           .collect(Collectors.toMap(r -> r.getRecordKey(), Function.identity()));
       HoodieCreateHandle createHandle =
-          new HoodieCreateHandle(config, "100", table, rowChange1.getPartitionPath(), "f1-0", insertRecordMap, supplier);
+          new HoodieCreateHandle(config, "100", table, insertRecords.get(0).getPartitionPath(), "f1-0", insertRecordMap, supplier);
       createHandle.write();
       return createHandle.close();
     }).collect();
 
     final Path commitFile = new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
     FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile);
+    return statuses.get(0);
+  }
+
+  @Test
+  public void testSchemaEvolutionOnUpdateSuccessWithAddColumnHaveDefault() throws Exception {
+    List<String> recordsStrs = new ArrayList<>();
+    String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
+    String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
+    String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+        + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
+    recordsStrs.add(recordStr1);
+    recordsStrs.add(recordStr2);
+    recordsStrs.add(recordStr3);
 
     // Now try an update with an evolved schema
     // Evolved schema does not have guarantee on preserving the original field ordering
     final HoodieWriteConfig config2 = makeHoodieClientConfig("/exampleEvolvedSchema.txt");
-    final WriteStatus insertResult = statuses.get(0);
+    final WriteStatus insertResult = prepareFirstCommitData(recordsStrs);
     String fileId = insertResult.getFileId();
 
-    final HoodieSparkTable table2 = HoodieSparkTable.create(config, context);
+    final HoodieSparkTable table2 = HoodieSparkTable.create(config2, context);
     assertEquals(1, jsc.parallelize(Arrays.asList(1)).map(x -> {

Review comment:
       I see this is original code, but I don't see why it needs to assert that 1 equals the 1 returned at the end. Could we simplify this by removing `jsc.parallelize()` and just retaining the lambda block?
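       Roughly this shape (a sketch only, since the rest of the chain and the lambda body aren't shown in this hunk):

```java
// Before: the update logic runs inside a single-element RDD and the test
// asserts on the constant the lambda returns, e.g.
//   assertEquals(1, jsc.parallelize(Arrays.asList(1)).map(x -> { /* update with evolved schema */ return 1; })...);
// After: run the same statements directly on the driver and let their own
// assertions fail the test if something is wrong.
final HoodieSparkTable table = HoodieSparkTable.create(config, context);
// ... update with the evolved schema against `fileId`, then assert on the
// resulting WriteStatus directly ...
```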




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org