Posted to commits@hudi.apache.org by xu...@apache.org on 2023/02/28 02:21:31 UTC

[hudi] branch master updated: [HUDI-5853] Add infer functions to BQ sync configs (#8053)

This is an automated email from the ASF dual-hosted git repository.

xushiyan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new b6fdcb7ada2 [HUDI-5853] Add infer functions to BQ sync configs (#8053)
b6fdcb7ada2 is described below

commit b6fdcb7ada215f58ca9cecfd78d4e1c12bba573d
Author: Shiyan Xu <27...@users.noreply.github.com>
AuthorDate: Mon Feb 27 20:21:23 2023 -0600

    [HUDI-5853] Add infer functions to BQ sync configs (#8053)
---
 .../hudi/gcp/bigquery/BigQuerySyncConfig.java      | 24 +++++-
 .../hudi/gcp/bigquery/TestBigQuerySyncConfig.java  | 86 +++++++++++++++++++---
 2 files changed, 97 insertions(+), 13 deletions(-)

diff --git a/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncConfig.java b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncConfig.java
index 8c04c0fa136..2c06127623c 100644
--- a/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncConfig.java
+++ b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncConfig.java
@@ -22,7 +22,11 @@ package org.apache.hudi.gcp.bigquery;
 import org.apache.hudi.common.config.ConfigClassProperty;
 import org.apache.hudi.common.config.ConfigGroups;
 import org.apache.hudi.common.config.ConfigProperty;
+import org.apache.hudi.common.config.HoodieMetadataConfig;
 import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.table.HoodieTableConfig;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
 import org.apache.hudi.sync.common.HoodieSyncConfig;
 
 import com.beust.jcommander.Parameter;
@@ -33,6 +37,11 @@ import javax.annotation.concurrent.Immutable;
 import java.io.Serializable;
 import java.util.Properties;
 
+import static org.apache.hudi.common.config.HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS;
+import static org.apache.hudi.common.table.HoodieTableConfig.DATABASE_NAME;
+import static org.apache.hudi.common.table.HoodieTableConfig.HOODIE_TABLE_NAME_KEY;
+import static org.apache.hudi.common.table.HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY;
+
 /**
  * Configs needed to sync data into BigQuery.
  */
@@ -50,6 +59,7 @@ public class BigQuerySyncConfig extends HoodieSyncConfig implements Serializable
   public static final ConfigProperty<String> BIGQUERY_SYNC_DATASET_NAME = ConfigProperty
       .key("hoodie.gcp.bigquery.sync.dataset_name")
       .noDefaultValue()
+      .withInferFunction(cfg -> Option.ofNullable(cfg.getString(DATABASE_NAME)))
       .withDocumentation("Name of the target dataset in BigQuery");
 
   public static final ConfigProperty<String> BIGQUERY_SYNC_DATASET_LOCATION = ConfigProperty
@@ -60,6 +70,8 @@ public class BigQuerySyncConfig extends HoodieSyncConfig implements Serializable
   public static final ConfigProperty<String> BIGQUERY_SYNC_TABLE_NAME = ConfigProperty
       .key("hoodie.gcp.bigquery.sync.table_name")
       .noDefaultValue()
+      .withInferFunction(cfg -> Option.ofNullable(cfg.getString(HOODIE_TABLE_NAME_KEY))
+          .or(() -> Option.ofNullable(cfg.getString(HOODIE_WRITE_TABLE_NAME_KEY))))
       .withDocumentation("Name of the target table in BigQuery");
 
   public static final ConfigProperty<String> BIGQUERY_SYNC_SOURCE_URI = ConfigProperty
@@ -75,26 +87,32 @@ public class BigQuerySyncConfig extends HoodieSyncConfig implements Serializable
   public static final ConfigProperty<String> BIGQUERY_SYNC_SYNC_BASE_PATH = ConfigProperty
       .key("hoodie.gcp.bigquery.sync.base_path")
       .noDefaultValue()
+      .withInferFunction(cfg -> Option.ofNullable(cfg.getString(META_SYNC_BASE_PATH)))
       .withDocumentation("Base path of the hoodie table to sync");
 
   public static final ConfigProperty<String> BIGQUERY_SYNC_PARTITION_FIELDS = ConfigProperty
       .key("hoodie.gcp.bigquery.sync.partition_fields")
       .noDefaultValue()
+      .withInferFunction(cfg -> Option.ofNullable(cfg.getString(HoodieTableConfig.PARTITION_FIELDS))
+          .or(() -> Option.ofNullable(cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME))))
       .withDocumentation("Comma-delimited partition fields. Default to non-partitioned.");
 
   public static final ConfigProperty<Boolean> BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA = ConfigProperty
       .key("hoodie.gcp.bigquery.sync.use_file_listing_from_metadata")
-      .defaultValue(false)
+      .defaultValue(DEFAULT_METADATA_ENABLE_FOR_READERS)
+      .withInferFunction(cfg -> Option.of(cfg.getBooleanOrDefault(HoodieMetadataConfig.ENABLE, DEFAULT_METADATA_ENABLE_FOR_READERS)))
       .withDocumentation("Fetch file listing from Hudi's metadata");
 
-  public static final ConfigProperty<Boolean> BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING = ConfigProperty
+  public static final ConfigProperty<String> BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING = ConfigProperty
       .key("hoodie.gcp.bigquery.sync.assume_date_partitioning")
-      .defaultValue(false)
+      .defaultValue(HoodieMetadataConfig.ASSUME_DATE_PARTITIONING.defaultValue())
+      .withInferFunction(cfg -> Option.ofNullable(cfg.getString(HoodieMetadataConfig.ASSUME_DATE_PARTITIONING)))
       .withDocumentation("Assume standard yyyy/mm/dd partitioning, this"
           + " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter");
 
   public BigQuerySyncConfig(Properties props) {
     super(props);
+    setDefaults(BigQuerySyncConfig.class.getName());
   }
 
   public static class BigQuerySyncConfigParams {
diff --git a/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySyncConfig.java b/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySyncConfig.java
index 82a85277384..6d96e9b9123 100644
--- a/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySyncConfig.java
+++ b/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySyncConfig.java
@@ -19,12 +19,16 @@
 
 package org.apache.hudi.gcp.bigquery;
 
-import org.junit.jupiter.api.BeforeEach;
+import org.apache.hudi.common.config.HoodieMetadataConfig;
+import org.apache.hudi.common.table.HoodieTableConfig;
+import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
+
 import org.junit.jupiter.api.Test;
 
 import java.util.Arrays;
 import java.util.Properties;
 
+import static org.apache.hudi.common.config.HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS;
 import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING;
 import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_LOCATION;
 import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_NAME;
@@ -36,13 +40,12 @@ import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SYNC
 import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_TABLE_NAME;
 import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA;
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
 
 public class TestBigQuerySyncConfig {
 
-  BigQuerySyncConfig syncConfig;
-
-  @BeforeEach
-  void setUp() {
+  @Test
+  public void testGetConfigs() {
     Properties props = new Properties();
     props.setProperty(BIGQUERY_SYNC_PROJECT_ID.key(), "fooproject");
     props.setProperty(BIGQUERY_SYNC_DATASET_NAME.key(), "foodataset");
@@ -54,11 +57,7 @@ public class TestBigQuerySyncConfig {
     props.setProperty(BIGQUERY_SYNC_PARTITION_FIELDS.key(), "a,b");
     props.setProperty(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), "true");
     props.setProperty(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING.key(), "true");
-    syncConfig = new BigQuerySyncConfig(props);
-  }
-
-  @Test
-  public void testGetConfigs() {
+    BigQuerySyncConfig syncConfig = new BigQuerySyncConfig(props);
     assertEquals("fooproject", syncConfig.getString(BIGQUERY_SYNC_PROJECT_ID));
     assertEquals("foodataset", syncConfig.getString(BIGQUERY_SYNC_DATASET_NAME));
     assertEquals("US", syncConfig.getString(BIGQUERY_SYNC_DATASET_LOCATION));
@@ -71,4 +70,71 @@ public class TestBigQuerySyncConfig {
     assertEquals(true, syncConfig.getBoolean(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING));
   }
 
+  @Test
+  public void testInferDatasetAndTableNames() {
+    Properties props1 = new Properties();
+    props1.setProperty(HoodieTableConfig.DATABASE_NAME.key(), "db1");
+    props1.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_KEY, "tbl1");
+    BigQuerySyncConfig config1 = new BigQuerySyncConfig(props1);
+    assertEquals("db1", config1.getString(BIGQUERY_SYNC_DATASET_NAME));
+    assertEquals("tbl1", config1.getString(BIGQUERY_SYNC_TABLE_NAME));
+
+    Properties props2 = new Properties();
+    props2.setProperty(HoodieTableConfig.DATABASE_NAME.key(), "db2");
+    props2.setProperty(HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY, "tbl2");
+    BigQuerySyncConfig config2 = new BigQuerySyncConfig(props2);
+    assertEquals("db2", config2.getString(BIGQUERY_SYNC_DATASET_NAME));
+    assertEquals("tbl2", config2.getString(BIGQUERY_SYNC_TABLE_NAME));
+  }
+
+  @Test
+  public void testInferPartitionFields() {
+    Properties props0 = new Properties();
+    BigQuerySyncConfig config0 = new BigQuerySyncConfig(props0);
+    assertNull(config0.getString(BIGQUERY_SYNC_PARTITION_FIELDS),
+        String.format("should get null due to absence of both %s and %s",
+            HoodieTableConfig.PARTITION_FIELDS.key(), KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()));
+
+    Properties props1 = new Properties();
+    props1.setProperty(HoodieTableConfig.PARTITION_FIELDS.key(), "foo,bar,baz");
+    BigQuerySyncConfig config1 = new BigQuerySyncConfig(props1);
+    assertEquals("foo,bar,baz", config1.getString(BIGQUERY_SYNC_PARTITION_FIELDS),
+        String.format("should infer from %s", HoodieTableConfig.PARTITION_FIELDS.key()));
+
+    Properties props2 = new Properties();
+    props2.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "foo,bar");
+    BigQuerySyncConfig config2 = new BigQuerySyncConfig(props2);
+    assertEquals("foo,bar", config2.getString(BIGQUERY_SYNC_PARTITION_FIELDS),
+        String.format("should infer from %s", KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()));
+
+    Properties props3 = new Properties();
+    props3.setProperty(HoodieTableConfig.PARTITION_FIELDS.key(), "foo,bar,baz");
+    props3.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "foo,bar");
+    BigQuerySyncConfig config3 = new BigQuerySyncConfig(props3);
+    assertEquals("foo,bar,baz", config3.getString(BIGQUERY_SYNC_PARTITION_FIELDS),
+        String.format("should infer from %s, which has higher precedence.", HoodieTableConfig.PARTITION_FIELDS.key()));
+
+  }
+
+  @Test
+  void testInferUseFileListingFromMetadata() {
+    BigQuerySyncConfig config1 = new BigQuerySyncConfig(new Properties());
+    assertEquals(DEFAULT_METADATA_ENABLE_FOR_READERS, config1.getBoolean(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA));
+
+    Properties props2 = new Properties();
+    props2.setProperty(HoodieMetadataConfig.ENABLE.key(), "true");
+    BigQuerySyncConfig config2 = new BigQuerySyncConfig(props2);
+    assertEquals(true, config2.getBoolean(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA));
+  }
+
+  @Test
+  void testInferAssumeDatePartition() {
+    BigQuerySyncConfig config1 = new BigQuerySyncConfig(new Properties());
+    assertEquals(false, config1.getBoolean(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING));
+
+    Properties props2 = new Properties();
+    props2.setProperty(HoodieMetadataConfig.ASSUME_DATE_PARTITIONING.key(), "true");
+    BigQuerySyncConfig config2 = new BigQuerySyncConfig(props2);
+    assertEquals(true, config2.getBoolean(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING));
+  }
 }
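
For illustration only (not part of this commit), here is a minimal sketch of how the new
inference behaves, mirroring the test cases above: when only the Hudi table-level configs
are set, the BigQuery dataset and table names are inferred from them. The class name
InferExample is hypothetical.

    import java.util.Properties;

    import org.apache.hudi.common.table.HoodieTableConfig;
    import org.apache.hudi.gcp.bigquery.BigQuerySyncConfig;

    import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_NAME;
    import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_TABLE_NAME;

    // Hypothetical example class, not part of the Hudi codebase.
    public class InferExample {
      public static void main(String[] args) {
        Properties props = new Properties();
        // Only Hudi table-level configs are provided; no BigQuery sync configs are set.
        props.setProperty(HoodieTableConfig.DATABASE_NAME.key(), "db1");
        props.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_KEY, "tbl1");

        BigQuerySyncConfig config = new BigQuerySyncConfig(props);
        // Inferred via the new infer functions from HoodieTableConfig.DATABASE_NAME
        // and HOODIE_TABLE_NAME_KEY respectively.
        System.out.println(config.getString(BIGQUERY_SYNC_DATASET_NAME)); // db1
        System.out.println(config.getString(BIGQUERY_SYNC_TABLE_NAME));   // tbl1
      }
    }

If explicit values for hoodie.gcp.bigquery.sync.dataset_name or
hoodie.gcp.bigquery.sync.table_name are supplied, they take precedence and the infer
functions are not consulted.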