You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by xu...@apache.org on 2023/02/28 02:21:31 UTC
[hudi] branch master updated: [HUDI-5853] Add infer functions to BQ sync configs (#8053)
This is an automated email from the ASF dual-hosted git repository.
xushiyan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new b6fdcb7ada2 [HUDI-5853] Add infer functions to BQ sync configs (#8053)
b6fdcb7ada2 is described below
commit b6fdcb7ada215f58ca9cecfd78d4e1c12bba573d
Author: Shiyan Xu <27...@users.noreply.github.com>
AuthorDate: Mon Feb 27 20:21:23 2023 -0600
[HUDI-5853] Add infer functions to BQ sync configs (#8053)
---
.../hudi/gcp/bigquery/BigQuerySyncConfig.java | 24 +++++-
.../hudi/gcp/bigquery/TestBigQuerySyncConfig.java | 86 +++++++++++++++++++---
2 files changed, 97 insertions(+), 13 deletions(-)
diff --git a/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncConfig.java b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncConfig.java
index 8c04c0fa136..2c06127623c 100644
--- a/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncConfig.java
+++ b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncConfig.java
@@ -22,7 +22,11 @@ package org.apache.hudi.gcp.bigquery;
import org.apache.hudi.common.config.ConfigClassProperty;
import org.apache.hudi.common.config.ConfigGroups;
import org.apache.hudi.common.config.ConfigProperty;
+import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.table.HoodieTableConfig;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import org.apache.hudi.sync.common.HoodieSyncConfig;
import com.beust.jcommander.Parameter;
@@ -33,6 +37,11 @@ import javax.annotation.concurrent.Immutable;
import java.io.Serializable;
import java.util.Properties;
+import static org.apache.hudi.common.config.HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS;
+import static org.apache.hudi.common.table.HoodieTableConfig.DATABASE_NAME;
+import static org.apache.hudi.common.table.HoodieTableConfig.HOODIE_TABLE_NAME_KEY;
+import static org.apache.hudi.common.table.HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY;
+
/**
* Configs needed to sync data into BigQuery.
*/
@@ -50,6 +59,7 @@ public class BigQuerySyncConfig extends HoodieSyncConfig implements Serializable
public static final ConfigProperty<String> BIGQUERY_SYNC_DATASET_NAME = ConfigProperty
.key("hoodie.gcp.bigquery.sync.dataset_name")
.noDefaultValue()
+ .withInferFunction(cfg -> Option.ofNullable(cfg.getString(DATABASE_NAME)))
.withDocumentation("Name of the target dataset in BigQuery");
public static final ConfigProperty<String> BIGQUERY_SYNC_DATASET_LOCATION = ConfigProperty
@@ -60,6 +70,8 @@ public class BigQuerySyncConfig extends HoodieSyncConfig implements Serializable
public static final ConfigProperty<String> BIGQUERY_SYNC_TABLE_NAME = ConfigProperty
.key("hoodie.gcp.bigquery.sync.table_name")
.noDefaultValue()
+ .withInferFunction(cfg -> Option.ofNullable(cfg.getString(HOODIE_TABLE_NAME_KEY))
+ .or(() -> Option.ofNullable(cfg.getString(HOODIE_WRITE_TABLE_NAME_KEY))))
.withDocumentation("Name of the target table in BigQuery");
public static final ConfigProperty<String> BIGQUERY_SYNC_SOURCE_URI = ConfigProperty
@@ -75,26 +87,32 @@ public class BigQuerySyncConfig extends HoodieSyncConfig implements Serializable
public static final ConfigProperty<String> BIGQUERY_SYNC_SYNC_BASE_PATH = ConfigProperty
.key("hoodie.gcp.bigquery.sync.base_path")
.noDefaultValue()
+ .withInferFunction(cfg -> Option.ofNullable(cfg.getString(META_SYNC_BASE_PATH)))
.withDocumentation("Base path of the hoodie table to sync");
public static final ConfigProperty<String> BIGQUERY_SYNC_PARTITION_FIELDS = ConfigProperty
.key("hoodie.gcp.bigquery.sync.partition_fields")
.noDefaultValue()
+ .withInferFunction(cfg -> Option.ofNullable(cfg.getString(HoodieTableConfig.PARTITION_FIELDS))
+ .or(() -> Option.ofNullable(cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME))))
.withDocumentation("Comma-delimited partition fields. Default to non-partitioned.");
public static final ConfigProperty<Boolean> BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA = ConfigProperty
.key("hoodie.gcp.bigquery.sync.use_file_listing_from_metadata")
- .defaultValue(false)
+ .defaultValue(DEFAULT_METADATA_ENABLE_FOR_READERS)
+ .withInferFunction(cfg -> Option.of(cfg.getBooleanOrDefault(HoodieMetadataConfig.ENABLE, DEFAULT_METADATA_ENABLE_FOR_READERS)))
.withDocumentation("Fetch file listing from Hudi's metadata");
- public static final ConfigProperty<Boolean> BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING = ConfigProperty
+ public static final ConfigProperty<String> BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING = ConfigProperty
.key("hoodie.gcp.bigquery.sync.assume_date_partitioning")
- .defaultValue(false)
+ .defaultValue(HoodieMetadataConfig.ASSUME_DATE_PARTITIONING.defaultValue())
+ .withInferFunction(cfg -> Option.ofNullable(cfg.getString(HoodieMetadataConfig.ASSUME_DATE_PARTITIONING)))
.withDocumentation("Assume standard yyyy/mm/dd partitioning, this"
+ " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter");
public BigQuerySyncConfig(Properties props) {
super(props);
+ setDefaults(BigQuerySyncConfig.class.getName());
}
public static class BigQuerySyncConfigParams {
diff --git a/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySyncConfig.java b/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySyncConfig.java
index 82a85277384..6d96e9b9123 100644
--- a/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySyncConfig.java
+++ b/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySyncConfig.java
@@ -19,12 +19,16 @@
package org.apache.hudi.gcp.bigquery;
-import org.junit.jupiter.api.BeforeEach;
+import org.apache.hudi.common.config.HoodieMetadataConfig;
+import org.apache.hudi.common.table.HoodieTableConfig;
+import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
+
import org.junit.jupiter.api.Test;
import java.util.Arrays;
import java.util.Properties;
+import static org.apache.hudi.common.config.HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_LOCATION;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_NAME;
@@ -36,13 +40,12 @@ import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SYNC
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_TABLE_NAME;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
public class TestBigQuerySyncConfig {
- BigQuerySyncConfig syncConfig;
-
- @BeforeEach
- void setUp() {
+ @Test
+ public void testGetConfigs() {
Properties props = new Properties();
props.setProperty(BIGQUERY_SYNC_PROJECT_ID.key(), "fooproject");
props.setProperty(BIGQUERY_SYNC_DATASET_NAME.key(), "foodataset");
@@ -54,11 +57,7 @@ public class TestBigQuerySyncConfig {
props.setProperty(BIGQUERY_SYNC_PARTITION_FIELDS.key(), "a,b");
props.setProperty(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), "true");
props.setProperty(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING.key(), "true");
- syncConfig = new BigQuerySyncConfig(props);
- }
-
- @Test
- public void testGetConfigs() {
+ BigQuerySyncConfig syncConfig = new BigQuerySyncConfig(props);
assertEquals("fooproject", syncConfig.getString(BIGQUERY_SYNC_PROJECT_ID));
assertEquals("foodataset", syncConfig.getString(BIGQUERY_SYNC_DATASET_NAME));
assertEquals("US", syncConfig.getString(BIGQUERY_SYNC_DATASET_LOCATION));
@@ -71,4 +70,71 @@ public class TestBigQuerySyncConfig {
assertEquals(true, syncConfig.getBoolean(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING));
}
+ @Test
+ public void testInferDatasetAndTableNames() {
+ Properties props1 = new Properties();
+ props1.setProperty(HoodieTableConfig.DATABASE_NAME.key(), "db1");
+ props1.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_KEY, "tbl1");
+ BigQuerySyncConfig config1 = new BigQuerySyncConfig(props1);
+ assertEquals("db1", config1.getString(BIGQUERY_SYNC_DATASET_NAME));
+ assertEquals("tbl1", config1.getString(BIGQUERY_SYNC_TABLE_NAME));
+
+ Properties props2 = new Properties();
+ props2.setProperty(HoodieTableConfig.DATABASE_NAME.key(), "db2");
+ props2.setProperty(HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY, "tbl2");
+ BigQuerySyncConfig config2 = new BigQuerySyncConfig(props2);
+ assertEquals("db2", config2.getString(BIGQUERY_SYNC_DATASET_NAME));
+ assertEquals("tbl2", config2.getString(BIGQUERY_SYNC_TABLE_NAME));
+ }
+
+ @Test
+ public void testInferPartitionFields() {
+ Properties props0 = new Properties();
+ BigQuerySyncConfig config0 = new BigQuerySyncConfig(props0);
+ assertNull(config0.getString(BIGQUERY_SYNC_PARTITION_FIELDS),
+ String.format("should get null due to absence of both %s and %s",
+ HoodieTableConfig.PARTITION_FIELDS.key(), KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()));
+
+ Properties props1 = new Properties();
+ props1.setProperty(HoodieTableConfig.PARTITION_FIELDS.key(), "foo,bar,baz");
+ BigQuerySyncConfig config1 = new BigQuerySyncConfig(props1);
+ assertEquals("foo,bar,baz", config1.getString(BIGQUERY_SYNC_PARTITION_FIELDS),
+ String.format("should infer from %s", HoodieTableConfig.PARTITION_FIELDS.key()));
+
+ Properties props2 = new Properties();
+ props2.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "foo,bar");
+ BigQuerySyncConfig config2 = new BigQuerySyncConfig(props2);
+ assertEquals("foo,bar", config2.getString(BIGQUERY_SYNC_PARTITION_FIELDS),
+ String.format("should infer from %s", KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()));
+
+ Properties props3 = new Properties();
+ props3.setProperty(HoodieTableConfig.PARTITION_FIELDS.key(), "foo,bar,baz");
+ props3.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "foo,bar");
+ BigQuerySyncConfig config3 = new BigQuerySyncConfig(props3);
+ assertEquals("foo,bar,baz", config3.getString(BIGQUERY_SYNC_PARTITION_FIELDS),
+ String.format("should infer from %s, which has higher precedence.", HoodieTableConfig.PARTITION_FIELDS.key()));
+
+ }
+
+ @Test
+ void testInferUseFileListingFromMetadata() {
+ BigQuerySyncConfig config1 = new BigQuerySyncConfig(new Properties());
+ assertEquals(DEFAULT_METADATA_ENABLE_FOR_READERS, config1.getBoolean(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA));
+
+ Properties props2 = new Properties();
+ props2.setProperty(HoodieMetadataConfig.ENABLE.key(), "true");
+ BigQuerySyncConfig config2 = new BigQuerySyncConfig(props2);
+ assertEquals(true, config2.getBoolean(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA));
+ }
+
+ @Test
+ void testInferAssumeDatePartition() {
+ BigQuerySyncConfig config1 = new BigQuerySyncConfig(new Properties());
+ assertEquals(false, config1.getBoolean(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING));
+
+ Properties props2 = new Properties();
+ props2.setProperty(HoodieMetadataConfig.ASSUME_DATE_PARTITIONING.key(), "true");
+ BigQuerySyncConfig config2 = new BigQuerySyncConfig(props2);
+ assertEquals(true, config2.getBoolean(BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING));
+ }
}