Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/02/17 19:51:24 UTC

[GitHub] [hudi] nsivabalan commented on a change in pull request #4175: [HUDI-2883] Refactor hive sync tool / config to use reflection and standardize configs

nsivabalan commented on a change in pull request #4175:
URL: https://github.com/apache/hudi/pull/4175#discussion_r809154493



##########
File path: hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/helpers/HiveServiceProvider.java
##########
@@ -46,12 +49,17 @@ public void startLocalHiveServiceIfNeeded(Configuration configuration) throws IO
   }
 
   public void syncToLocalHiveIfNeeded(HoodieTestSuiteWriter writer) {
+    HiveSyncTool hiveSyncTool;
     if (this.config.isHiveLocal()) {
-      writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync()
-          .syncHive(getLocalHiveServer().getHiveConf());
+      hiveSyncTool = new HiveSyncTool(writer.getWriteConfig().getProps(),

Review comment:
      have we removed or deprecated DeltaSync.syncHive(...) in this patch? If not, I would prefer to go via the DeltaSync syncHive API.

##########
File path: hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala
##########
@@ -379,109 +380,92 @@ object DataSourceWriteOptions {
   // HIVE SYNC SPECIFIC CONFIGS
   // NOTE: DO NOT USE uppercase for the keys as they are internally lower-cased. Using upper-cases causes
   // unexpected issues with config getting reset
-  val HIVE_SYNC_ENABLED: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.datasource.hive_sync.enable")
-    .defaultValue("false")
-    .withDocumentation("When set to true, register/sync the table to Apache Hive metastore")
-
-  val META_SYNC_ENABLED: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.datasource.meta.sync.enable")
-    .defaultValue("false")
-    .withDocumentation("")
-
-  val HIVE_DATABASE: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.datasource.hive_sync.database")
-    .defaultValue("default")
-    .withDocumentation("database to sync to")
-
-  val hiveTableOptKeyInferFunc = DataSourceOptionsHelper.scalaFunctionToJavaFunction((p: HoodieConfig) => {
-    if (p.contains(TABLE_NAME)) {
-      Option.of(p.getString(TABLE_NAME))
-    } else if (p.contains(HoodieWriteConfig.TBL_NAME)) {
-      Option.of(p.getString(HoodieWriteConfig.TBL_NAME))
-    } else {
-      Option.empty[String]()
-    }
-  })
-  val HIVE_TABLE: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.datasource.hive_sync.table")
-    .defaultValue("unknown")
-    .withInferFunction(hiveTableOptKeyInferFunc)
-    .withDocumentation("table to sync to")
-
-  val HIVE_BASE_FILE_FORMAT: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.datasource.hive_sync.base_file_format")
-    .defaultValue("PARQUET")
-    .withDocumentation("Base file format for the sync.")
-
-  val HIVE_USER: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.datasource.hive_sync.username")
-    .defaultValue("hive")
-    .withDocumentation("hive user name to use")
-
-  val HIVE_PASS: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.datasource.hive_sync.password")
-    .defaultValue("hive")
-    .withDocumentation("hive password to use")
-
-  val HIVE_URL: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.datasource.hive_sync.jdbcurl")
-    .defaultValue("jdbc:hive2://localhost:10000")
-    .withDocumentation("Hive jdbc url")
-
-  val METASTORE_URIS: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.datasource.hive_sync.metastore.uris")
-    .defaultValue("thrift://localhost:9083")
-    .withDocumentation("Hive metastore url")
-
-  val hivePartitionFieldsInferFunc = DataSourceOptionsHelper.scalaFunctionToJavaFunction((p: HoodieConfig) => {
-    if (p.contains(PARTITIONPATH_FIELD)) {
-      Option.of(p.getString(PARTITIONPATH_FIELD))
-    } else {
-      Option.empty[String]()
-    }
-  })
-  val HIVE_PARTITION_FIELDS: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.datasource.hive_sync.partition_fields")
-    .defaultValue("")
-    .withDocumentation("Field in the table to use for determining hive partition columns.")
-    .withInferFunction(hivePartitionFieldsInferFunc)
-
-  val hivePartitionExtractorInferFunc = DataSourceOptionsHelper.scalaFunctionToJavaFunction((p: HoodieConfig) => {
-    if (!p.contains(PARTITIONPATH_FIELD)) {
-      Option.of(classOf[NonPartitionedExtractor].getName)
-    } else {
-      val numOfPartFields = p.getString(PARTITIONPATH_FIELD).split(",").length
-      if (numOfPartFields == 1 && p.contains(HIVE_STYLE_PARTITIONING) && p.getString(HIVE_STYLE_PARTITIONING) == "true") {
-        Option.of(classOf[HiveStylePartitionValueExtractor].getName)
-      } else {
-        Option.of(classOf[MultiPartKeysValueExtractor].getName)
-      }
-    }
-  })
-  val HIVE_PARTITION_EXTRACTOR_CLASS: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.datasource.hive_sync.partition_extractor_class")
-    .defaultValue(classOf[SlashEncodedDayPartitionValueExtractor].getCanonicalName)
-    .withDocumentation("Class which implements PartitionValueExtractor to extract the partition values, "
-      + "default 'SlashEncodedDayPartitionValueExtractor'.")
-    .withInferFunction(hivePartitionExtractorInferFunc)
-
-  val HIVE_ASSUME_DATE_PARTITION: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.datasource.hive_sync.assume_date_partitioning")
-    .defaultValue("false")
-    .withDocumentation("Assume partitioning is yyyy/mm/dd")
-
-  val HIVE_USE_PRE_APACHE_INPUT_FORMAT: ConfigProperty[String] = ConfigProperty
-    .key("hoodie.datasource.hive_sync.use_pre_apache_input_format")
-    .defaultValue("false")
-    .withDocumentation("Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. "
-      + "Use this when you are in the process of migrating from "
-      + "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format")
+  /**
+   * @deprecated Hive Specific Configs are moved to {@link HiveSyncConfig}
+   */
+  @Deprecated
+  val HIVE_SYNC_ENABLED: ConfigProperty[String] = HiveSyncConfig.HIVE_SYNC_ENABLED
+  @Deprecated
+  val META_SYNC_ENABLED: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_ENABLED
+  @Deprecated
+  val HIVE_DATABASE: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_DATABASE_NAME
+  @Deprecated
+  val hiveTableOptKeyInferFunc: JavaFunction[HoodieConfig, Option[String]] = HoodieSyncConfig.TABLE_NAME_INFERENCE_FUNCTION
+  @Deprecated
+  val HIVE_TABLE: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_TABLE_NAME
+  @Deprecated
+  val HIVE_BASE_FILE_FORMAT: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT
+  @Deprecated
+  val HIVE_USER: ConfigProperty[String] = HiveSyncConfig.HIVE_USER
+  @Deprecated
+  val HIVE_PASS: ConfigProperty[String] = HiveSyncConfig.HIVE_PASS
+  @Deprecated
+  val HIVE_URL: ConfigProperty[String] = HiveSyncConfig.HIVE_URL
+  @Deprecated
+  val METASTORE_URIS: ConfigProperty[String] = HiveSyncConfig.METASTORE_URIS
+  @Deprecated
+  val hivePartitionFieldsInferFunc: JavaFunction[HoodieConfig, Option[String]] = HoodieSyncConfig.PARTITION_FIELDS_INFERENCE_FUNCTION
+  @Deprecated
+  val HIVE_PARTITION_FIELDS: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_PARTITION_FIELDS
+  @Deprecated
+  val hivePartitionExtractorInferFunc: JavaFunction[HoodieConfig, Option[String]] = HoodieSyncConfig.PARTITION_EXTRACTOR_CLASS_FUNCTION
+  @Deprecated
+  val HIVE_PARTITION_EXTRACTOR_CLASS: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS
+  @Deprecated
+  val HIVE_ASSUME_DATE_PARTITION: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION
+  @Deprecated
+  val HIVE_USE_PRE_APACHE_INPUT_FORMAT: ConfigProperty[String] = HiveSyncConfig.HIVE_USE_PRE_APACHE_INPUT_FORMAT
 
   // spark data source write pool name. Incase of streaming sink, users might be interested to set custom scheduling configs
   // for regular writes and async compaction. In such cases, this pool name will be used for spark datasource writes.
   val SPARK_DATASOURCE_WRITER_POOL_NAME = "sparkdatasourcewrite"
 
+  /** @deprecated Use {@link HIVE_SYNC_MODE} instead of this config from 0.9.0 */
+  @Deprecated
+  val HIVE_USE_JDBC: ConfigProperty[String] = HiveSyncConfig.HIVE_USE_JDBC
+  @Deprecated
+  val HIVE_AUTO_CREATE_DATABASE: ConfigProperty[String] = HiveSyncConfig.HIVE_AUTO_CREATE_DATABASE
+  @Deprecated
+  val HIVE_IGNORE_EXCEPTIONS: ConfigProperty[String] = HiveSyncConfig.HIVE_IGNORE_EXCEPTIONS
+  @Deprecated
+  val HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE: ConfigProperty[String] = HiveSyncConfig.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE
+  @Deprecated
+  val HIVE_SUPPORT_TIMESTAMP_TYPE: ConfigProperty[String] = HiveSyncConfig.HIVE_SUPPORT_TIMESTAMP_TYPE
+
+  /**
+   * Flag to indicate whether to use conditional syncing in HiveSync.
+   * If set true, the Hive sync procedure will only run if partition or schema changes are detected.
+   * By default true.
+   */
+  @Deprecated
+  val HIVE_CONDITIONAL_SYNC: ConfigProperty[String] = HoodieSyncConfig.META_SYNC_CONDITIONAL_SYNC
+  @Deprecated
+  val HIVE_TABLE_PROPERTIES: ConfigProperty[String] = HiveSyncConfig.HIVE_TABLE_PROPERTIES
+  @Deprecated
+  val HIVE_TABLE_SERDE_PROPERTIES: ConfigProperty[String] = HiveSyncConfig.HIVE_TABLE_SERDE_PROPERTIES
+  @Deprecated
+  val HIVE_SYNC_AS_DATA_SOURCE_TABLE: ConfigProperty[String] = HiveSyncConfig.HIVE_SYNC_AS_DATA_SOURCE_TABLE
+
+  // Create table as managed table
+  @Deprecated
+  val HIVE_CREATE_MANAGED_TABLE: ConfigProperty[Boolean] = ConfigProperty

Review comment:
      we can make this similar to L461:
   val HIVE_CREATE_MANAGED_TABLE: ConfigProperty[Boolean] = HiveSyncConfig.HIVE_CREATE_MANAGED_TABLE
   
   and similarly for the other ConfigProperty vals here in lines 451 to 467.

##########
File path: hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/SyncUtilHelpers.java
##########
@@ -0,0 +1,54 @@
+package org.apache.hudi.sync.common.util;
+
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.ReflectionUtils;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.sync.common.AbstractSyncTool;
+import org.apache.hudi.sync.common.HoodieSyncConfig;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+
+import java.util.Properties;
+
+/**
+ * Helper class for syncing Hudi commit data with external metastores.
+ */
+public class SyncUtilHelpers {
+
+  /**
+   * Create an instance of an implementation of {@link AbstractSyncTool} that will sync all the relevant meta information
+   * with an external metastore such as Hive etc. to ensure Hoodie tables can be queried or read via external systems.
+   *
+   * @param metaSyncClass  The class that implements the sync of the metadata.
+   * @param props          property map.
+   * @param hadoopConfig   Hadoop confs.
+   * @param fs             Filesystem used.
+   * @param targetBasePath The target base path that contains the hoodie table.
+   * @param baseFileFormat The file format used by the hoodie table (defaults to PARQUET).
+   * @return
+   */
+  public static void createAndSyncHoodieMeta(String metaSyncClass,
+                                             TypedProperties props,
+                                             Configuration hadoopConfig,
+                                             FileSystem fs,
+                                             String targetBasePath,
+                                             String baseFileFormat) {
+    TypedProperties properties = new TypedProperties();
+    properties.putAll(props);
+    properties.put(HoodieSyncConfig.META_SYNC_BASE_PATH, targetBasePath);
+    properties.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT, baseFileFormat);
+
+    try {
+      ((AbstractSyncTool) ReflectionUtils.loadClass(metaSyncClass,
+          new Class<?>[] {TypedProperties.class, Configuration.class, FileSystem.class},
+          properties, hadoopConfig, fs)).syncHoodieTable();
+    } catch (HoodieException e) {
+      // fallback to old interface
+      ((AbstractSyncTool) ReflectionUtils.loadClass(metaSyncClass,

Review comment:
      do we have tests covering this fallback?
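      For illustration only, a minimal test along these lines might look like the sketch below (DummyLegacySyncTool is hypothetical, and the AbstractSyncTool (Properties, FileSystem) super constructor is assumed based on the legacy reflection call this patch replaces in KafkaConnectTransactionServices; JUnit imports omitted):

        // hypothetical sync tool that only exposes the legacy (Properties, FileSystem)
        // constructor, so the first reflective lookup in createAndSyncHoodieMeta throws
        // HoodieException and the fallback path is exercised
        public static class DummyLegacySyncTool extends AbstractSyncTool {
          public DummyLegacySyncTool(Properties props, FileSystem fs) {
            super(props, fs);
          }

          @Override
          public void syncHoodieTable() {
            // no-op: reaching this method shows the fallback constructor was used
          }
        }

        @Test
        public void testFallbackToLegacyConstructor() throws Exception {
          FileSystem fs = FileSystem.get(new Configuration());
          SyncUtilHelpers.createAndSyncHoodieMeta(DummyLegacySyncTool.class.getName(),
              new TypedProperties(), new Configuration(), fs, "/tmp/hoodie_table", "PARQUET");
        }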

##########
File path: hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java
##########
@@ -167,43 +161,10 @@ private void syncMeta() {
     if (connectConfigs.isMetaSyncEnabled()) {
       Set<String> syncClientToolClasses = new HashSet<>(
           Arrays.asList(connectConfigs.getMetaSyncClasses().split(",")));
+      FileSystem fs = FSUtils.getFs(tableBasePath, new Configuration());
       for (String impl : syncClientToolClasses) {
-        impl = impl.trim();
-        switch (impl) {
-          case "org.apache.hudi.hive.HiveSyncTool":
-            syncHive();
-            break;
-          default:
-            FileSystem fs = FSUtils.getFs(tableBasePath, new Configuration());
-            Properties properties = new Properties();
-            properties.putAll(connectConfigs.getProps());
-            properties.put("basePath", tableBasePath);
-            AbstractSyncTool syncTool = (AbstractSyncTool) ReflectionUtils.loadClass(impl, new Class[] {Properties.class, FileSystem.class}, properties, fs);
-            syncTool.syncHoodieTable();
-        }
+        SyncUtilHelpers.createAndSyncHoodieMeta(impl.trim(), connectConfigs.getProps(), hadoopConf, fs, tableBasePath, HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.defaultValue());

Review comment:
       this looks neat :) 

##########
File path: hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java
##########
@@ -52,48 +41,31 @@
   @Parameter(names = {"--metastore-uris"}, description = "Hive metastore uris")
   public String metastoreUris;
 
-  @Parameter(names = {"--base-path"}, description = "Basepath of hoodie table to sync", required = true)
-  public String basePath;
-
-  @Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
-  public List<String> partitionFields = new ArrayList<>();
-
-  @Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor "
-      + "to extract the partition values from HDFS path")
-  public String partitionValueExtractorClass = SlashEncodedDayPartitionValueExtractor.class.getName();
-
-  @Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
-      + " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
-  public Boolean assumeDatePartitioning = false;
-
   @Parameter(names = {"--use-pre-apache-input-format"},
       description = "Use InputFormat under com.uber.hoodie package "
           + "instead of org.apache.hudi package. Use this when you are in the process of migrating from "
           + "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to "
           + "org.apache.hudi input format.")
-  public Boolean usePreApacheInputFormat = false;
+  public Boolean usePreApacheInputFormat;
 
   @Parameter(names = {"--bucket-spec"}, description = "bucket spec stored in metastore", required = false)
   public String bucketSpec;
 
   @Deprecated
   @Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url")
-  public Boolean useJdbc = true;
+  public Boolean useJdbc;
 
   @Parameter(names = {"--sync-mode"}, description = "Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql")
   public String syncMode;
 
   @Parameter(names = {"--auto-create-database"}, description = "Auto create hive database")
-  public Boolean autoCreateDatabase = true;
+  public Boolean autoCreateDatabase;

Review comment:
      same question as above?

##########
File path: hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableDropPartitionCommand.scala
##########
@@ -102,15 +103,15 @@ case class AlterHoodieTableDropPartitionCommand(
         RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","),
         PRECOMBINE_FIELD.key -> hoodieCatalogTable.preCombineKey.getOrElse(""),
         PARTITIONPATH_FIELD.key -> partitionFields,
-        HIVE_SYNC_ENABLED.key -> enableHive.toString,
-        META_SYNC_ENABLED.key -> enableHive.toString,
-        HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(),
-        HIVE_USE_JDBC.key -> "false",
-        HIVE_DATABASE.key -> hoodieCatalogTable.table.identifier.database.getOrElse("default"),
-        HIVE_TABLE.key -> hoodieCatalogTable.table.identifier.table,
-        HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true",
-        HIVE_PARTITION_FIELDS.key -> partitionFields,
-        HIVE_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName
+        HoodieSyncConfig.META_SYNC_ENABLED.key -> enableHive.toString,

Review comment:
      I always wanted to understand these two configs. When would someone enable META_SYNC_ENABLED but not HIVE_SYNC_ENABLED? Can you help me understand the difference between these two configs?

##########
File path: hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java
##########
@@ -276,6 +276,7 @@ public static HoodieRecord createHoodieRecord(GenericRecord gr, HoodieKey hKey,
     return dropDuplicates(jssc, incomingHoodieRecords, writeConfig);
   }
 
+  @Deprecated

Review comment:
      can you add a javadoc noting what the new API to build HiveSyncConfig is?
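      For example, the deprecation javadoc could point at the properties-driven setup this patch moves to; a rough sketch (config keys are taken from HiveSyncConfig/HoodieSyncConfig elsewhere in this patch, the values and the hadoopConf/fs/basePath variables are illustrative placeholders):

        TypedProperties props = new TypedProperties();
        props.setProperty(HiveSyncConfig.HIVE_SYNC_ENABLED.key(), "true");
        props.setProperty(HiveSyncConfig.HIVE_URL.key(), "jdbc:hive2://localhost:10000");
        props.setProperty(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "default");
        props.setProperty(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "my_table");
        props.setProperty(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr");
        // the props are then handed to the sync tool via reflection instead of a
        // hand-built HiveSyncConfig
        SyncUtilHelpers.createAndSyncHoodieMeta(HiveSyncTool.class.getName(),
            props, hadoopConf, fs, basePath, "PARQUET");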

##########
File path: hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java
##########
@@ -52,48 +41,31 @@
   @Parameter(names = {"--metastore-uris"}, description = "Hive metastore uris")
   public String metastoreUris;
 
-  @Parameter(names = {"--base-path"}, description = "Basepath of hoodie table to sync", required = true)
-  public String basePath;
-
-  @Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
-  public List<String> partitionFields = new ArrayList<>();
-
-  @Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor "
-      + "to extract the partition values from HDFS path")
-  public String partitionValueExtractorClass = SlashEncodedDayPartitionValueExtractor.class.getName();
-
-  @Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
-      + " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
-  public Boolean assumeDatePartitioning = false;
-
   @Parameter(names = {"--use-pre-apache-input-format"},
       description = "Use InputFormat under com.uber.hoodie package "
           + "instead of org.apache.hudi package. Use this when you are in the process of migrating from "
           + "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to "
           + "org.apache.hudi input format.")
-  public Boolean usePreApacheInputFormat = false;
+  public Boolean usePreApacheInputFormat;
 
   @Parameter(names = {"--bucket-spec"}, description = "bucket spec stored in metastore", required = false)
   public String bucketSpec;
 
   @Deprecated
   @Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url")
-  public Boolean useJdbc = true;
+  public Boolean useJdbc;

Review comment:
      are we flipping the default here intentionally?

##########
File path: hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java
##########
@@ -106,31 +78,163 @@
 
   @Parameter(names = {"--support-timestamp"}, description = "'INT64' with original type TIMESTAMP_MICROS is converted to hive 'timestamp' type."
       + "Disabled by default for backward compatibility.")
-  public Boolean supportTimestamp = false;
-
-  @Parameter(names = {"--decode-partition"}, description = "Decode the partition value if the partition has encoded during writing")
-  public Boolean decodePartition = false;
+  public Boolean supportTimestamp;
 
   @Parameter(names = {"--managed-table"}, description = "Create a managed table")
-  public Boolean createManagedTable = false;
+  public Boolean createManagedTable;
 
   @Parameter(names = {"--batch-sync-num"}, description = "The number of partitions one batch when synchronous partitions to hive")
-  public Integer batchSyncNum = 1000;
+  public Integer batchSyncNum;
 
   @Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table.")
-  public Boolean syncAsSparkDataSourceTable = true;
+  public Boolean syncAsSparkDataSourceTable;
 
   @Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore.")
-  public int sparkSchemaLengthThreshold = 4000;
+  public int sparkSchemaLengthThreshold;
 
   @Parameter(names = {"--with-operation-field"}, description = "Whether to include the '_hoodie_operation' field in the metadata fields")
   public Boolean withOperationField = false;
 
-  @Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.")
-  public Boolean isConditionalSync = false;
 
-  @Parameter(names = {"--spark-version"}, description = "The spark version", required = false)
-  public String sparkVersion;
+  // HIVE SYNC SPECIFIC CONFIGS
+  // NOTE: DO NOT USE uppercase for the keys as they are internally lower-cased. Using upper-cases causes
+  // unexpected issues with config getting reset
+  public static final ConfigProperty<String> HIVE_SYNC_ENABLED = ConfigProperty
+      .key("hoodie.datasource.hive_sync.enable")
+      .defaultValue("false")
+      .withDocumentation("When set to true, register/sync the table to Apache Hive metastore.");
+
+  public static final ConfigProperty<String> HIVE_USER = ConfigProperty
+      .key("hoodie.datasource.hive_sync.username")
+      .defaultValue("hive")
+      .withDocumentation("hive user name to use");
+
+  public static final ConfigProperty<String> HIVE_PASS = ConfigProperty
+      .key("hoodie.datasource.hive_sync.password")
+      .defaultValue("hive")
+      .withDocumentation("hive password to use");
+
+  public static final ConfigProperty<String> HIVE_URL = ConfigProperty
+      .key("hoodie.datasource.hive_sync.jdbcurl")
+      .defaultValue("jdbc:hive2://localhost:10000")
+      .withDocumentation("Hive metastore url");
+
+  public static final ConfigProperty<String> HIVE_USE_PRE_APACHE_INPUT_FORMAT = ConfigProperty
+      .key("hoodie.datasource.hive_sync.use_pre_apache_input_format")
+      .defaultValue("false")
+      .withDocumentation("Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. "
+          + "Use this when you are in the process of migrating from "
+          + "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format");
+
+  /**
+   * @deprecated Use {@link HIVE_SYNC_MODE} instead of this config from 0.9.0
+   */
+  @Deprecated
+  public static final ConfigProperty<String> HIVE_USE_JDBC = ConfigProperty
+      .key("hoodie.datasource.hive_sync.use_jdbc")
+      .defaultValue("true")

Review comment:
       ah, I see the reason why you have removed it above.

##########
File path: hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java
##########
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.sync.common;
+
+import org.apache.hudi.common.config.ConfigProperty;
+import org.apache.hudi.common.config.HoodieConfig;
+import org.apache.hudi.common.config.HoodieMetadataConfig;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.table.HoodieTableConfig;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
+
+import com.beust.jcommander.Parameter;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Function;
+
+/**
+ * Configs needed to sync data into external meta stores, catalogs, etc.
+ */
+public class HoodieSyncConfig extends HoodieConfig {
+
+  public static final String META_SYNC_BASE_PATH = "meta.sync.base.path";
+
+  @Parameter(names = {"--database"}, description = "name of the target database in Hive", required = true)
+  public String databaseName;
+
+  @Parameter(names = {"--table"}, description = "name of the target table in Hive", required = true)
+  public String tableName;
+
+  @Parameter(names = {"--base-path"}, description = "Basepath of hoodie table to sync", required = true)
+  public String basePath;
+
+  @Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)")
+  public String baseFileFormat;
+
+  @Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
+  public List<String> partitionFields;
+
+  @Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor "
+      + "to extract the partition values from HDFS path")
+  public String partitionValueExtractorClass;
+
+  @Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
+      + " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
+  public Boolean assumeDatePartitioning;
+
+  @Parameter(names = {"--decode-partition"}, description = "Decode the partition value if the partition has encoded during writing")
+  public Boolean decodePartition;
+
+  @Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
+  public Boolean useFileListingFromMetadata;
+
+  @Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.")
+  public Boolean isConditionalSync;
+
+  @Parameter(names = {"--spark-version"}, description = "The spark version", required = false)
+  public String sparkVersion;
+
+  public static final ConfigProperty<String> META_SYNC_ENABLED = ConfigProperty
+      .key("hoodie.datasource.meta.sync.enable")
+      .defaultValue("false")
+      .withDocumentation("Enable Syncing the Hudi Table with an external meta store or data catalog.");
+
+  // ToDo change the prefix of the following configs from hive_sync to meta_sync
+  public static final ConfigProperty<String> META_SYNC_DATABASE_NAME = ConfigProperty
+      .key("hoodie.datasource.hive_sync.database")
+      .defaultValue("default")
+      .withDocumentation("The name of the destination database that we should sync the hudi table to.");
+
+  // If the table name for the metastore destination is not provided, pick it up from write or table configs.
+  public static final Function<HoodieConfig, Option<String>> TABLE_NAME_INFERENCE_FUNCTION = cfg -> {
+    if (cfg.contains(HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY)) {
+      return Option.of(cfg.getString(HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY));
+    } else if (cfg.contains(HoodieTableConfig.HOODIE_TABLE_NAME_KEY)) {
+      return Option.of(cfg.getString(HoodieTableConfig.HOODIE_TABLE_NAME_KEY));
+    } else {
+      return Option.empty();
+    }
+  };
+  public static final ConfigProperty<String> META_SYNC_TABLE_NAME = ConfigProperty
+      .key("hoodie.datasource.hive_sync.table")
+      .defaultValue("unknown")

Review comment:
      shouldn't this have no default?
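      i.e. something like the sketch below, assuming ConfigProperty supports noDefaultValue() here the way other Hudi configs use it (doc text kept from the old HIVE_TABLE property):

        public static final ConfigProperty<String> META_SYNC_TABLE_NAME = ConfigProperty
            .key("hoodie.datasource.hive_sync.table")
            .noDefaultValue()
            .withInferFunction(TABLE_NAME_INFERENCE_FUNCTION)
            .withDocumentation("table to sync to");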

##########
File path: hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHiveIncrementalPuller.java
##########
@@ -141,6 +144,7 @@ public void testPullerWithoutIncrementalClause() throws IOException, URISyntaxEx
             "select name from testdb.test1"));
     Exception e = assertThrows(HoodieIncrementalPullSQLException.class, puller::saveDelta,
             "Should fail when incremental clause not provided!");
+    System.out.println("WNI " + e.getMessage());

Review comment:
       can you remove this? 
   CC @vinothchandar @yihua: Rajesh's love for WNI :)

##########
File path: hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java
##########
@@ -112,16 +117,21 @@ public static void setUp() throws IOException, InterruptedException, HiveExcepti
     }
     fileSystem = FileSystem.get(configuration);
 
-    hiveSyncConfig = new HiveSyncConfig();
-    hiveSyncConfig.jdbcUrl = hiveTestService.getJdbcHive2Url();
-    hiveSyncConfig.hiveUser = "";
-    hiveSyncConfig.hivePass = "";
-    hiveSyncConfig.databaseName = "testdb";
-    hiveSyncConfig.tableName = "test1";
-    hiveSyncConfig.basePath = Files.createTempDirectory("hivesynctest" + Instant.now().toEpochMilli()).toUri().toString();
-    hiveSyncConfig.assumeDatePartitioning = true;
-    hiveSyncConfig.usePreApacheInputFormat = false;
-    hiveSyncConfig.partitionFields = Collections.singletonList("datestr");
+    basePath = Files.createTempDirectory("hivesynctest" + Instant.now().toEpochMilli()).toUri().toString();
+
+    hiveSyncProps = new TypedProperties();
+    hiveSyncProps.setProperty(HiveSyncConfig.HIVE_URL.key(), hiveTestService.getJdbcHive2Url());
+    hiveSyncProps.setProperty(HiveSyncConfig.HIVE_USER.key(), "");
+    hiveSyncProps.setProperty(HiveSyncConfig.HIVE_PASS.key(), "");
+    hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), DB_NAME);
+    hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), TABLE_NAME);
+    hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_BASE_PATH, basePath);
+    hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.key(), "true");
+    hiveSyncProps.setProperty(HiveSyncConfig.HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false");
+    hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr");
+    hiveSyncProps.setProperty(HiveSyncConfig.HIVE_BATCH_SYNC_PARTITION_NUM.key(), "3");

Review comment:
      I don't see jdbcUrl being set, and you are additionally setting BATCH_SYNC_PARTITION_NUM in this patch. Is that intentional?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org