Posted to commits@hudi.apache.org by si...@apache.org on 2020/05/28 00:18:54 UTC

[hudi] 11/40: [HUDI-607] Fix to allow creation/syncing of Hive tables partitioned by Date type columns (#1330)

This is an automated email from the ASF dual-hosted git repository.

sivabalan pushed a commit to branch release-0.5.3
in repository https://gitbox.apache.org/repos/asf/hudi.git

commit 0050e00afdc728793f9928cbb594e657c7219c41
Author: Udit Mehrotra <um...@illinois.edu>
AuthorDate: Sun Mar 1 10:42:58 2020 -0800

    [HUDI-607] Fix to allow creation/syncing of Hive tables partitioned by Date type columns (#1330)
---
 .../main/java/org/apache/hudi/DataSourceUtils.java | 40 +++++++++++++-
 hudi-spark/src/test/java/DataSourceUtilsTest.java  | 61 ++++++++++++++++++++++
 2 files changed, 100 insertions(+), 1 deletion(-)
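
For context: Avro's date logical type is physically an int counting days since the Unix epoch
(1970-01-01), and before this fix that raw integer is what surfaced as the partition value when
syncing to Hive. Below is a minimal standalone sketch of the conversion this commit introduces
(the class name is illustrative, not part of the patch):

    import java.time.LocalDate;

    public class EpochDayExample {
      public static void main(String[] args) {
        int epochDays = 18000;                       // how a logical Date is stored in Parquet
        LocalDate date = LocalDate.ofEpochDay(epochDays);
        System.out.println(date);                    // prints 2019-04-14
      }
    }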

diff --git a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java
index 475a925..99a795d 100644
--- a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java
+++ b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java
@@ -18,6 +18,8 @@
 
 package org.apache.hudi;
 
+import org.apache.avro.LogicalTypes;
+import org.apache.avro.Schema;
 import org.apache.hudi.client.HoodieReadClient;
 import org.apache.hudi.client.HoodieWriteClient;
 import org.apache.hudi.client.WriteStatus;
@@ -43,6 +45,7 @@ import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 
 import java.io.IOException;
+import java.time.LocalDate;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -78,7 +81,8 @@ public class DataSourceUtils {
 
       // return, if last part of name
       if (i == parts.length - 1) {
-        return val;
+        Schema fieldSchema = valueNode.getSchema().getField(part).schema();
+        return convertValueForSpecificDataTypes(fieldSchema, val);
       } else {
         // VC: Need a test here
         if (!(val instanceof GenericRecord)) {
@@ -98,6 +102,40 @@ public class DataSourceUtils {
   }
 
   /**
+   * This method converts values for fields with certain Avro/Parquet data types that require special handling.
+   *
+   * The logical Date type is converted to an actual date value, instead of the epoch-day integer with
+   * which it is represented/stored in Parquet.
+   *
+   * @param fieldSchema avro field schema
+   * @param fieldValue avro field value
+   * @return the field value, either converted (for certain data types) or returned as-is.
+   */
+  private static Object convertValueForSpecificDataTypes(Schema fieldSchema, Object fieldValue) {
+    if (fieldSchema == null) {
+      return fieldValue;
+    }
+
+    if (isLogicalTypeDate(fieldSchema)) {
+      return LocalDate.ofEpochDay(Long.parseLong(fieldValue.toString()));
+    }
+    return fieldValue;
+  }
+
+  /**
+   * Given an Avro field schema, checks whether the field is of Avro's Date logical type or not.
+   *
+   * @param fieldSchema avro field schema
+   * @return boolean indicating whether fieldSchema is of Avro's Date Logical Type
+   */
+  private static boolean isLogicalTypeDate(Schema fieldSchema) {
+    if (fieldSchema.getType() == Schema.Type.UNION) {
+      return fieldSchema.getTypes().stream().anyMatch(schema -> schema.getLogicalType() == LogicalTypes.date());
+    }
+    return fieldSchema.getLogicalType() == LogicalTypes.date();
+  }
+
+  /**
    * Create a key generator class via reflection, passing in any configs needed.
    * <p>
    * If the class name of key generator is configured through the properties file, i.e., {@code props}, use the
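
A note on the union branch of isLogicalTypeDate() above: a nullable date column is declared in
Avro as a union of null and the date type, and the logical type is attached to the union member
rather than to the field schema itself, which is why the union's member schemas must be scanned.
A small sketch of this (hypothetical driver class, assuming only the standard Avro API):

    import org.apache.avro.LogicalTypes;
    import org.apache.avro.Schema;

    public class UnionDateCheck {
      public static void main(String[] args) {
        Schema recordSchema = new Schema.Parser().parse(
            "{\"type\": \"record\", \"name\": \"r\", \"fields\": ["
                + "{\"name\": \"d\", \"type\": [\"null\", {\"type\": \"int\", \"logicalType\": \"date\"}]}]}");
        Schema fieldSchema = recordSchema.getField("d").schema();
        // The union schema itself carries no logical type ...
        System.out.println(fieldSchema.getLogicalType());                // null
        // ... but one of its member schemas does, which is what the patch checks for.
        System.out.println(fieldSchema.getTypes().stream()
            .anyMatch(s -> s.getLogicalType() == LogicalTypes.date())); // true
      }
    }

The identity comparison with LogicalTypes.date() used here (and in the patch) works because Avro
exposes the date logical type as a shared singleton instance.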
diff --git a/hudi-spark/src/test/java/DataSourceUtilsTest.java b/hudi-spark/src/test/java/DataSourceUtilsTest.java
new file mode 100644
index 0000000..4fe7547
--- /dev/null
+++ b/hudi-spark/src/test/java/DataSourceUtilsTest.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hudi.DataSourceUtils;
+import org.junit.Test;
+
+import java.time.LocalDate;
+
+import static org.junit.Assert.assertEquals;
+
+public class DataSourceUtilsTest {
+
+  @Test
+  public void testAvroRecordsFieldConversion() {
+    // Fields event_date1, event_date2 and event_date3 carry the Date logical type. event_date1 and
+    // event_date3 are of UNION schema type (a union of null and the date type, in different orders),
+    // while event_date2 is a plain, non-union date type.
+    String avroSchemaString = "{\"type\": \"record\"," + "\"name\": \"events\"," + "\"fields\": [ "
+        + "{\"name\": \"event_date1\", \"type\" : [{\"type\" : \"int\", \"logicalType\" : \"date\"}, \"null\"]},"
+        + "{\"name\": \"event_date2\", \"type\" : {\"type\": \"int\", \"logicalType\" : \"date\"}},"
+        + "{\"name\": \"event_date3\", \"type\" : [\"null\", {\"type\" : \"int\", \"logicalType\" : \"date\"}]},"
+        + "{\"name\": \"event_name\", \"type\": \"string\"},"
+        + "{\"name\": \"event_organizer\", \"type\": \"string\"}"
+        + "]}";
+
+    Schema avroSchema = new Schema.Parser().parse(avroSchemaString);
+    GenericRecord record = new GenericData.Record(avroSchema);
+    record.put("event_date1", 18000);
+    record.put("event_date2", 18001);
+    record.put("event_date3", 18002);
+    record.put("event_name", "Hudi Meetup");
+    record.put("event_organizer", "Hudi PMC");
+
+    assertEquals(LocalDate.ofEpochDay(18000).toString(), DataSourceUtils.getNestedFieldValAsString(record, "event_date1",
+        true));
+    assertEquals(LocalDate.ofEpochDay(18001).toString(), DataSourceUtils.getNestedFieldValAsString(record, "event_date2",
+        true));
+    assertEquals(LocalDate.ofEpochDay(18002).toString(), DataSourceUtils.getNestedFieldValAsString(record, "event_date3",
+        true));
+    assertEquals("Hudi Meetup", DataSourceUtils.getNestedFieldValAsString(record, "event_name", true));
+    assertEquals("Hudi PMC", DataSourceUtils.getNestedFieldValAsString(record, "event_organizer", true));
+  }
+}
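
For reference, LocalDate.ofEpochDay(18000).toString() evaluates to "2019-04-14" (and 18001/18002
to the two following days), so the assertions above verify that date-typed fields come back as
ISO-8601 date strings suitable for Hive partition values, rather than as the raw epoch-day
integers stored in Parquet.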