Posted to commits@hudi.apache.org by yi...@apache.org on 2022/06/28 19:27:40 UTC

[hudi] branch master updated: [HUDI-4320] Make sure `HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED` could be specified by the writer (#5970)

This is an automated email from the ASF dual-hosted git repository.

yihua pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new ed823f1c6f [HUDI-4320] Make sure `HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED` could be specified by the writer (#5970)
ed823f1c6f is described below

commit ed823f1c6f99fe1626a07b347d4ce063f47c42f8
Author: Alexey Kudinkin <al...@infinilake.com>
AuthorDate: Tue Jun 28 12:27:32 2022 -0700

    [HUDI-4320] Make sure `HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED` could be specified by the writer (#5970)
    
    Fixed the sequence determining whether Parquet's legacy-format writing property should be overridden, so that the override only kicks in when the property has not been explicitly specified by the caller
---
 .../java/org/apache/hudi/util/DataTypeUtils.java   | 15 ++---
 .../main/java/org/apache/hudi/DataSourceUtils.java | 46 ++++++++++----
 .../java/org/apache/hudi/TestDataSourceUtils.java  | 74 +++++++++++++---------
 .../org/apache/hudi/internal/DefaultSource.java    |  4 +-
 .../apache/hudi/spark3/internal/DefaultSource.java |  4 +-
 5 files changed, 89 insertions(+), 54 deletions(-)
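
In short, the guard changed as sketched below (a condensed sketch only: KEY abbreviates HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED.key(), and the method name is shown post-rename; see the DataSourceUtils.java hunk below for the full context):

    // Before: the override also fired when the writer explicitly set the
    // property to "false", silently flipping it to "true"
    if (hasSmallPrecisionDecimalType(schema)
        && !Boolean.parseBoolean(properties.getOrDefault(KEY, "false"))) { ... }

    // After: the override only fires when the property was not set at all
    if (hasSmallPrecisionDecimalType(schema)
        && properties.get(KEY) == null) { ... }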

diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/util/DataTypeUtils.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/util/DataTypeUtils.java
index 1f4e8cc1dc..3ff7e1055c 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/util/DataTypeUtils.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/util/DataTypeUtils.java
@@ -126,24 +126,23 @@ public class DataTypeUtils {
   }
 
   /**
-   * Try to find current sparktype whether contains that DecimalType which's scale < Decimal.MAX_LONG_DIGITS().
-   *
-   * @param sparkType spark schema.
-   * @return found result.
+   * Checks whether the provided {@link DataType} contains a {@link DecimalType} whose precision
+   * is less than {@link Decimal#MAX_LONG_DIGITS()}.
    */
-  public static boolean foundSmallPrecisionDecimalType(DataType sparkType) {
+  public static boolean hasSmallPrecisionDecimalType(DataType sparkType) {
     if (sparkType instanceof StructType) {
       StructField[] fields = ((StructType) sparkType).fields();
-      return Arrays.stream(fields).anyMatch(f -> foundSmallPrecisionDecimalType(f.dataType()));
+      return Arrays.stream(fields).anyMatch(f -> hasSmallPrecisionDecimalType(f.dataType()));
     } else if (sparkType instanceof MapType) {
       MapType map = (MapType) sparkType;
-      return foundSmallPrecisionDecimalType(map.keyType()) || foundSmallPrecisionDecimalType(map.valueType());
+      return hasSmallPrecisionDecimalType(map.keyType()) || hasSmallPrecisionDecimalType(map.valueType());
     } else if (sparkType instanceof ArrayType) {
-      return foundSmallPrecisionDecimalType(((ArrayType) sparkType).elementType());
+      return hasSmallPrecisionDecimalType(((ArrayType) sparkType).elementType());
     } else if (sparkType instanceof DecimalType) {
       DecimalType decimalType = (DecimalType) sparkType;
       return decimalType.precision() < Decimal.MAX_LONG_DIGITS();
     }
+
     return false;
   }
 }
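
For illustration, a minimal, self-contained usage sketch of the renamed helper (the class name DecimalCheckExample is invented here; assumes hudi-spark-client and Spark SQL on the classpath):

    import org.apache.hudi.util.DataTypeUtils;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.Metadata;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;

    public class DecimalCheckExample {
      public static void main(String[] args) {
        // DecimalType(10, 2) has precision 10, below Decimal.MAX_LONG_DIGITS() (18),
        // so by default Spark's ParquetWriteSupport would encode it as INT64
        StructType schema = new StructType(new StructField[] {
            new StructField("price", DataTypes.createDecimalType(10, 2), false, Metadata.empty()),
            new StructField("name", DataTypes.StringType, false, Metadata.empty())
        });
        System.out.println(DataTypeUtils.hasSmallPrecisionDecimalType(schema)); // true
      }
    }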
diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java
index 4042f431d7..c5b1857699 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java
@@ -18,6 +18,9 @@
 
 package org.apache.hudi;
 
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.hudi.client.HoodieReadClient;
 import org.apache.hudi.client.HoodieWriteResult;
 import org.apache.hudi.client.SparkRDDWriteClient;
@@ -45,10 +48,6 @@ import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor;
 import org.apache.hudi.sync.common.HoodieSyncConfig;
 import org.apache.hudi.table.BulkInsertPartitioner;
 import org.apache.hudi.util.DataTypeUtils;
-
-import org.apache.avro.generic.GenericRecord;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.spark.api.java.JavaRDD;
@@ -327,14 +326,39 @@ public class DataSourceUtils {
     return hiveSyncConfig;
   }
 
-  // Now by default ParquetWriteSupport will write DecimalType to parquet as int32/int64 when the scale of decimalType < Decimal.MAX_LONG_DIGITS(),
-  // but AvroParquetReader which used by HoodieParquetReader cannot support read int32/int64 as DecimalType.
-  // try to find current schema whether contains that DecimalType, and auto set the value of "hoodie.parquet.writelegacyformat.enabled"
-  public static void mayBeOverwriteParquetWriteLegacyFormatProp(Map<String, String> properties, StructType schema) {
-    if (DataTypeUtils.foundSmallPrecisionDecimalType(schema)
-        && !Boolean.parseBoolean(properties.getOrDefault(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED.key(), "false"))) {
+
+  /**
+   * Checks whether the default value (false) of "hoodie.parquet.writelegacyformat.enabled" should
+   * be overridden, namely when:
+   *
+   * <ul>
+   *   <li>The property has not been explicitly set by the writer</li>
+   *   <li>The data schema contains a {@code DecimalType} that would be affected by it</li>
+   * </ul>
+   *
+   * If both of the aforementioned conditions are true, this method will override the default value
+   * of the config (by explicitly setting it to "true") so that the produced Parquet data files can
+   * be read by {@code AvroParquetReader}.
+   *
+   * @param properties properties specified by the writer
+   * @param schema schema of the dataset being written
+   */
+  public static void tryOverrideParquetWriteLegacyFormatProperty(Map<String, String> properties, StructType schema) {
+    if (DataTypeUtils.hasSmallPrecisionDecimalType(schema)
+        && properties.get(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED.key()) == null) {
+      // ParquetWriteSupport writes DecimalType to parquet as INT32/INT64 when the precision of the
+      // DecimalType is less than {@code Decimal.MAX_LONG_DIGITS}, but {@code AvroParquetReader}, which
+      // is used by {@code HoodieParquetReader}, does not support reading DecimalType encoded as INT32/INT64.
+      //
+      // To work around this problem, we check whether
+      //    - Schema contains any decimals that could be encoded as INT32/INT64
+      //    - {@code HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED} has not been explicitly
+      //      set by the writer
+      //
+      // If both of these conditions are true, then we override the default value of {@code
+      // HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED} and set it to "true"
+      LOG.warn("Small Decimal Type found in the persisted schema, overriding the default value of 'hoodie.parquet.writelegacyformat.enabled' to true");
       properties.put(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED.key(), "true");
-      LOG.warn("Small Decimal Type found in current schema, auto set the value of hoodie.parquet.writelegacyformat.enabled to true");
     }
   }
 }
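
A behavioral sketch of the new semantics (the driver class is hypothetical; the schema construction mirrors the test below):

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.hudi.DataSourceUtils;
    import org.apache.hudi.config.HoodieStorageConfig;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.Metadata;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;

    public class LegacyFormatOverrideExample {
      public static void main(String[] args) {
        StructType schema = new StructType(new StructField[] {
            new StructField("amount", DataTypes.createDecimalType(10, 2), false, Metadata.empty())
        });
        String key = HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED.key();

        // Case 1: property not set by the writer -> the override kicks in
        Map<String, String> unset = new HashMap<>();
        DataSourceUtils.tryOverrideParquetWriteLegacyFormatProperty(unset, schema);
        System.out.println(unset.get(key)); // "true"

        // Case 2: writer explicitly set "false" -> respected (this is the fix;
        // previously it was silently flipped to "true")
        Map<String, String> explicit = new HashMap<>();
        explicit.put(key, "false");
        DataSourceUtils.tryOverrideParquetWriteLegacyFormatProperty(explicit, schema);
        System.out.println(explicit.get(key)); // "false"
      }
    }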
diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java
index af5bbe7717..41c061d2bc 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java
+++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java
@@ -18,6 +18,12 @@
 
 package org.apache.hudi;
 
+import org.apache.avro.Conversions;
+import org.apache.avro.LogicalTypes;
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericFixed;
+import org.apache.avro.generic.GenericRecord;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.client.SparkRDDWriteClient;
 import org.apache.hudi.common.config.TypedProperties;
@@ -27,21 +33,16 @@ import org.apache.hudi.common.model.WriteOperationType;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.collection.ImmutablePair;
 import org.apache.hudi.config.HoodieClusteringConfig;
+import org.apache.hudi.config.HoodieStorageConfig;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner;
 import org.apache.hudi.hive.HiveSyncConfig;
 import org.apache.hudi.table.BulkInsertPartitioner;
-
-import org.apache.avro.Conversions;
-import org.apache.avro.LogicalTypes;
-import org.apache.avro.Schema;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericFixed;
-import org.apache.avro.generic.GenericRecord;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.types.DecimalType;
 import org.apache.spark.sql.types.DecimalType$;
 import org.apache.spark.sql.types.Metadata;
 import org.apache.spark.sql.types.StructField;
@@ -51,7 +52,8 @@ import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.extension.ExtendWith;
 import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.CsvSource;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
 import org.junit.jupiter.params.provider.ValueSource;
 import org.mockito.ArgumentCaptor;
 import org.mockito.Captor;
@@ -61,11 +63,13 @@ import org.mockito.junit.jupiter.MockitoExtension;
 import java.math.BigDecimal;
 import java.time.LocalDate;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;
+import java.util.stream.Stream;
 
-import static org.apache.hudi.DataSourceUtils.mayBeOverwriteParquetWriteLegacyFormatProp;
+import static org.apache.hudi.DataSourceUtils.tryOverrideParquetWriteLegacyFormatProperty;
 import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
 import static org.apache.hudi.hive.ddl.HiveSyncMode.HMS;
 import static org.hamcrest.CoreMatchers.containsString;
@@ -313,31 +317,39 @@ public class TestDataSourceUtils {
   }
 
   @ParameterizedTest
-  @CsvSource({"true, false", "true, true", "false, true", "false, false"})
-  public void testAutoModifyParquetWriteLegacyFormatParameter(boolean smallDecimal, boolean defaultWriteValue) {
-    // create test StructType
-    List<StructField> structFields = new ArrayList<>();
+  @MethodSource("testAutoModifyParquetWriteLegacyFormatParameterParams")
+  public void testAutoModifyParquetWriteLegacyFormatParameter(boolean smallDecimal, Boolean propValue, Boolean expectedPropValue) {
+    DecimalType decimalType;
     if (smallDecimal) {
-      structFields.add(StructField.apply("d1", DecimalType$.MODULE$.apply(10, 2), false, Metadata.empty()));
+      decimalType = DecimalType$.MODULE$.apply(10, 2);
     } else {
-      structFields.add(StructField.apply("d1", DecimalType$.MODULE$.apply(38, 10), false, Metadata.empty()));
+      decimalType = DecimalType$.MODULE$.apply(38, 10);
     }
-    StructType structType = StructType$.MODULE$.apply(structFields);
-    // create write options
-    Map<String, String> options = new HashMap<>();
-    options.put("hoodie.parquet.writelegacyformat.enabled", String.valueOf(defaultWriteValue));
 
-    // start test
-    mayBeOverwriteParquetWriteLegacyFormatProp(options, structType);
+    StructType structType = StructType$.MODULE$.apply(
+        Arrays.asList(
+            StructField.apply("d1", decimalType, false, Metadata.empty())
+        )
+    );
 
-    // check result
-    boolean res = Boolean.parseBoolean(options.get("hoodie.parquet.writelegacyformat.enabled"));
-    if (smallDecimal) {
-      // should auto modify "hoodie.parquet.writelegacyformat.enabled" = "true".
-      assertEquals(true, res);
-    } else {
-      // should not modify the value of "hoodie.parquet.writelegacyformat.enabled".
-      assertEquals(defaultWriteValue, res);
-    }
+    Map<String, String> options = propValue != null
+        ? Collections.singletonMap(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED.key(), String.valueOf(propValue))
+        : new HashMap<>();
+
+    tryOverrideParquetWriteLegacyFormatProperty(options, structType);
+
+    Boolean finalPropValue =
+        Option.ofNullable(options.get(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED.key()))
+            .map(Boolean::parseBoolean)
+            .orElse(null);
+    assertEquals(expectedPropValue, finalPropValue);
+  }
+
+  private static Stream<Arguments> testAutoModifyParquetWriteLegacyFormatParameterParams() {
+    return Arrays.stream(new Object[][] {
+        {true, null, true},   {false, null, null},
+        {true, false, false}, {true, true, true},
+        {false, true, true},  {false, false, false}
+    }).map(Arguments::of);
   }
 }
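
For reference, the parameter matrix above encodes the following expectations ("(unset)" means the writer did not specify the property):

    smallDecimal | property before | property after
    -------------+-----------------+--------------------------------
    true         | (unset)         | "true"    (override kicks in)
    false        | (unset)         | (unset)   (schema unaffected)
    true/false   | "true"/"false"  | unchanged (explicit value wins)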
diff --git a/hudi-spark-datasource/hudi-spark2/src/main/java/org/apache/hudi/internal/DefaultSource.java b/hudi-spark-datasource/hudi-spark2/src/main/java/org/apache/hudi/internal/DefaultSource.java
index 4866a5be5c..3b3b8eafb8 100644
--- a/hudi-spark-datasource/hudi-spark2/src/main/java/org/apache/hudi/internal/DefaultSource.java
+++ b/hudi-spark-datasource/hudi-spark2/src/main/java/org/apache/hudi/internal/DefaultSource.java
@@ -36,7 +36,7 @@ import org.apache.spark.sql.types.StructType;
 import java.util.Map;
 import java.util.Optional;
 
-import static org.apache.hudi.DataSourceUtils.mayBeOverwriteParquetWriteLegacyFormatProp;
+import static org.apache.hudi.DataSourceUtils.tryOverrideParquetWriteLegacyFormatProperty;
 
 /**
  * DataSource V2 implementation for managing internal write logic. Only called internally.
@@ -69,7 +69,7 @@ public class DefaultSource extends BaseDefaultSource implements DataSourceV2,
         HoodieTableConfig.POPULATE_META_FIELDS.defaultValue());
     Map<String, String> properties = options.asMap();
     // Auto set the value of "hoodie.parquet.writelegacyformat.enabled"
-    mayBeOverwriteParquetWriteLegacyFormatProp(properties, schema);
+    tryOverrideParquetWriteLegacyFormatProperty(properties, schema);
     // 1st arg to createHoodieConfig is not really required to be set. but passing it anyways.
     HoodieWriteConfig config = DataSourceUtils.createHoodieConfig(options.get(HoodieWriteConfig.AVRO_SCHEMA_STRING.key()).get(), path, tblName, properties);
     boolean arePartitionRecordsSorted = HoodieInternalConfig.getBulkInsertIsPartitionRecordsSorted(
diff --git a/hudi-spark-datasource/hudi-spark3-common/src/main/java/org/apache/hudi/spark3/internal/DefaultSource.java b/hudi-spark-datasource/hudi-spark3-common/src/main/java/org/apache/hudi/spark3/internal/DefaultSource.java
index 4f7ff89a90..ab2f16703b 100644
--- a/hudi-spark-datasource/hudi-spark3-common/src/main/java/org/apache/hudi/spark3/internal/DefaultSource.java
+++ b/hudi-spark-datasource/hudi-spark3-common/src/main/java/org/apache/hudi/spark3/internal/DefaultSource.java
@@ -34,7 +34,7 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap;
 import java.util.HashMap;
 import java.util.Map;
 
-import static org.apache.hudi.DataSourceUtils.mayBeOverwriteParquetWriteLegacyFormatProp;
+import static org.apache.hudi.DataSourceUtils.tryOverrideParquetWriteLegacyFormatProperty;
 
 /**
  * DataSource V2 implementation for managing internal write logic. Only called internally.
@@ -59,7 +59,7 @@ public class DefaultSource extends BaseDefaultSource implements TableProvider {
     // Create a new map as the properties is an unmodifiableMap on Spark 3.2.0
     Map<String, String> newProps = new HashMap<>(properties);
     // Auto set the value of "hoodie.parquet.writelegacyformat.enabled"
-    mayBeOverwriteParquetWriteLegacyFormatProp(newProps, schema);
+    tryOverrideParquetWriteLegacyFormatProperty(newProps, schema);
     // 1st arg to createHoodieConfig is not really required to be set. but passing it anyways.
     HoodieWriteConfig config = DataSourceUtils.createHoodieConfig(newProps.get(HoodieWriteConfig.AVRO_SCHEMA_STRING.key()), path, tblName, newProps);
     return new HoodieDataSourceInternalTable(instantTime, config, schema, getSparkSession(),
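
For context, a hedged sketch of the writer-side effect of this change (the Dataset `df`, table name, and path are hypothetical, and the record-key and other options a real Hudi write needs are omitted for brevity):

    // With this fix, an explicit setting of the property is honored even when
    // the schema contains small-precision decimals (previously it could be
    // silently flipped to "true")
    df.write()
      .format("hudi")
      .option("hoodie.table.name", "my_table")                      // hypothetical
      .option("hoodie.parquet.writelegacyformat.enabled", "false")  // now respected
      .mode("append")
      .save("/tmp/hudi/my_table");                                  // hypothetical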