You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by bh...@apache.org on 2020/05/04 18:27:23 UTC

[incubator-hudi] branch master updated: Add changes for presto mor queries (#1578)

This is an automated email from the ASF dual-hosted git repository.

bhavanisudha pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new e21441a  Add changes for presto mor queries (#1578)
e21441a is described below

commit e21441ad8317f302fed947c414e059a332e4d1ef
Author: bschell <bd...@gmail.com>
AuthorDate: Mon May 4 11:27:14 2020 -0700

    Add changes for presto mor queries (#1578)
    
    Adds the necessary changes to hudi to support presto queries on the
    realtime view of hudi merge-on-read tables.
    
    Co-authored-by: Brandon Scheller <bs...@amazon.com>
---
 .../hadoop/UseRecordReaderFromInputFormat.java     | 38 ++++++++++++++++++++++
 .../realtime/HoodieParquetRealtimeInputFormat.java |  2 ++
 .../org/apache/hudi/hadoop/TestAnnotation.java     | 23 +++++++++++--
 packaging/hudi-presto-bundle/pom.xml               | 18 ++++++++++
 4 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/UseRecordReaderFromInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/UseRecordReaderFromInputFormat.java
new file mode 100644
index 0000000..fe87323
--- /dev/null
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/UseRecordReaderFromInputFormat.java
@@ -0,0 +1,38 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.hudi.hadoop;
+
+import java.lang.annotation.Documented;
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Inherited;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+* When annotated on an InputFormat, informs the query engines that they should use the RecordReader provided by the
+* input format to execute the queries.
+*/
+@Inherited
+@Documented
+@Target(ElementType.TYPE)
+@Retention(RetentionPolicy.RUNTIME)
+public @interface UseRecordReaderFromInputFormat {
+
+}
\ No newline at end of file
diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java
index ce86807..ae3fb5c 100644
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java
@@ -47,6 +47,7 @@ import org.apache.hadoop.mapred.InputSplit;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.RecordReader;
 import org.apache.hadoop.mapred.Reporter;
+import org.apache.hudi.hadoop.UseRecordReaderFromInputFormat;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 
@@ -63,6 +64,7 @@ import java.util.stream.Stream;
 /**
  * Input Format, that provides a real-time view of data in a Hoodie table.
  */
+@UseRecordReaderFromInputFormat
 @UseFileSplitsFromInputFormat
 public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat implements Configurable {
 
diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestAnnotation.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestAnnotation.java
index 15d0a6c..1f74c7a 100644
--- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestAnnotation.java
+++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestAnnotation.java
@@ -19,7 +19,7 @@
 package org.apache.hudi.hadoop;
 
 import org.junit.jupiter.api.Test;
-
+import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
 import java.lang.annotation.Annotation;
 
 import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -27,7 +27,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 public class TestAnnotation {
 
   @Test
-  public void testAnnotation() {
+  public void testHoodieParquetInputFormatAnnotation() {
     assertTrue(HoodieParquetInputFormat.class.isAnnotationPresent(UseFileSplitsFromInputFormat.class));
     Annotation[] annotations = HoodieParquetInputFormat.class.getAnnotations();
     boolean found = false;
@@ -38,4 +38,23 @@ public class TestAnnotation {
     }
     assertTrue(found);
   }
+
+  @Test
+  public void testHoodieParquetRealtimeInputFormatAnnotations() {
+    assertTrue(HoodieParquetRealtimeInputFormat.class.isAnnotationPresent(UseFileSplitsFromInputFormat.class));
+    assertTrue(HoodieParquetRealtimeInputFormat.class.isAnnotationPresent(UseRecordReaderFromInputFormat.class));
+    Annotation[] annotations = HoodieParquetRealtimeInputFormat.class.getAnnotations();
+    boolean foundFileSplitsAnnotation = false;
+    boolean foundRecordReaderAnnotation = false;
+    for (Annotation annotation : annotations) {
+      if ("UseFileSplitsFromInputFormat".equals(annotation.annotationType().getSimpleName())) {
+        foundFileSplitsAnnotation = true;
+      }
+      if ("UseRecordReaderFromInputFormat".equals(annotation.annotationType().getSimpleName())) {
+        foundRecordReaderAnnotation = true;
+      }
+    }
+    assertTrue(foundFileSplitsAnnotation);
+    assertTrue(foundRecordReaderAnnotation);
+  }
 }
diff --git a/packaging/hudi-presto-bundle/pom.xml b/packaging/hudi-presto-bundle/pom.xml
index cccde22..c51c22b 100644
--- a/packaging/hudi-presto-bundle/pom.xml
+++ b/packaging/hudi-presto-bundle/pom.xml
@@ -68,6 +68,7 @@
                   <include>org.apache.hudi:hudi-hadoop-mr</include>
 
                   <include>org.apache.parquet:parquet-avro</include>
+                  <include>org.apache.avro:avro</include>
                   <include>com.esotericsoftware:kryo-shaded</include>
                   <include>org.objenesis:objenesis</include>
                   <include>com.esotericsoftware:minlog</include>
@@ -76,6 +77,10 @@
               <relocations>
 
                 <relocation>
+                  <pattern>org.apache.avro.</pattern>
+                  <shadedPattern>org.apache.hudi.org.apache.avro.</shadedPattern>
+                </relocation>
+                <relocation>
                   <pattern>com.esotericsoftware.kryo.</pattern>
                   <shadedPattern>org.apache.hudi.com.esotericsoftware.kryo.</shadedPattern>
                 </relocation>
@@ -128,5 +133,18 @@
       <artifactId>hudi-hadoop-mr-bundle</artifactId>
       <version>${project.version}</version>
     </dependency>
+
+    <!-- Parquet -->
+    <dependency>
+      <groupId>org.apache.parquet</groupId>
+      <artifactId>parquet-avro</artifactId>
+      <scope>compile</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.avro</groupId>
+      <artifactId>avro</artifactId>
+      <scope>compile</scope>
+    </dependency>
   </dependencies>
 </project>