You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by xu...@apache.org on 2022/07/27 21:58:35 UTC

[hudi] branch master updated: [HUDI-4126] Disable file splits for Bootstrap real time queries (via InputFormat) (#6219)

This is an automated email from the ASF dual-hosted git repository.

xushiyan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new 51599af281 [HUDI-4126] Disable file splits for Bootstrap real time queries (via InputFormat) (#6219)
51599af281 is described below

commit 51599af2818562b6cea9bd01bd81af363209a2d2
Author: Rahil C <32...@users.noreply.github.com>
AuthorDate: Wed Jul 27 14:58:29 2022 -0700

    [HUDI-4126] Disable file splits for Bootstrap real time queries (via InputFormat) (#6219)
    
    
    Co-authored-by: Udit Mehrotra <ud...@amazon.com>
    Co-authored-by: Raymond Xu <27...@users.noreply.github.com>
---
 .../hudi/hadoop/realtime/HoodieRealtimePath.java   |  2 +-
 .../TestHoodieCopyOnWriteTableInputFormat.java     | 60 +++++++++++++++++++
 .../TestHoodieMergeOnReadTableInputFormat.java     | 68 ++++++++++++++++++++++
 3 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java
index bba44d5c66..1f1dd1b927 100644
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java
@@ -89,7 +89,7 @@ public class HoodieRealtimePath extends Path {
   }
 
   public boolean isSplitable() {
-    return !toString().isEmpty();
+    return !toString().isEmpty() && !includeBootstrapFilePath();
   }
 
   public PathWithBootstrapFileStatus getPathWithBootstrapFileStatus() {
diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieCopyOnWriteTableInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieCopyOnWriteTableInputFormat.java
new file mode 100644
index 0000000000..902778ed1c
--- /dev/null
+++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieCopyOnWriteTableInputFormat.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.hadoop;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.IOException;
+import java.net.URI;
+import java.nio.file.Files;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+
+public class TestHoodieCopyOnWriteTableInputFormat {
+
+  @TempDir
+  java.nio.file.Path tempDir;
+  private FileSystem fs;
+
+  @BeforeEach
+  void setUp() throws IOException {
+    fs = FileSystem.get(tempDir.toUri(), new Configuration());
+  }
+
+  @AfterEach
+  void tearDown() throws IOException {
+    fs.close();
+  }
+
+  @Test
+  void pathNotSplitableForBootstrapScenario() throws IOException {
+    URI source = Files.createTempFile(tempDir, "source", ".parquet").toUri();
+    URI target = Files.createTempFile(tempDir, "target", ".parquet").toUri();
+    PathWithBootstrapFileStatus path = new PathWithBootstrapFileStatus(new Path(target), fs.getFileStatus(new Path(source)));
+    HoodieCopyOnWriteTableInputFormat cowInputFormat = new HoodieCopyOnWriteTableInputFormat();
+    assertFalse(cowInputFormat.isSplitable(fs, path), "Path for bootstrap should not be splitable.");
+  }
+}
diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadTableInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadTableInputFormat.java
new file mode 100644
index 0000000000..d44f5fbf63
--- /dev/null
+++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadTableInputFormat.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.hadoop.realtime;
+
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.hadoop.PathWithBootstrapFileStatus;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.IOException;
+import java.net.URI;
+import java.nio.file.Files;
+import java.util.Collections;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class TestHoodieMergeOnReadTableInputFormat {
+
+  @TempDir
+  java.nio.file.Path tempDir;
+  private FileSystem fs;
+
+  @BeforeEach
+  void setUp() throws IOException {
+    fs = FileSystem.get(tempDir.toUri(), new Configuration());
+  }
+
+  @AfterEach
+  void tearDown() throws IOException {
+    fs.close();
+  }
+
+  @Test
+  void pathNotSplitableForBootstrapScenario() throws IOException {
+    URI source = Files.createTempFile(tempDir, "source", ".parquet").toUri();
+    URI target = Files.createTempFile(tempDir, "target", ".parquet").toUri();
+    HoodieRealtimePath rtPath = new HoodieRealtimePath(new Path("foo"), "bar", target.toString(), Collections.emptyList(), "000", false, Option.empty());
+    assertTrue(new HoodieMergeOnReadTableInputFormat().isSplitable(fs, rtPath));
+
+    PathWithBootstrapFileStatus path = new PathWithBootstrapFileStatus(new Path(target), fs.getFileStatus(new Path(source)));
+    rtPath.setPathWithBootstrapFileStatus(path);
+    assertFalse(new HoodieMergeOnReadTableInputFormat().isSplitable(fs, rtPath), "Path for bootstrap should not be splitable.");
+  }
+}