You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/03/11 11:24:20 UTC

[GitHub] [hudi] xushiyan commented on a change in pull request #4175: [HUDI-2883] Refactor hive sync tool / config to use reflection and standardize configs

xushiyan commented on a change in pull request #4175:
URL: https://github.com/apache/hudi/pull/4175#discussion_r824606178



##########
File path: hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java
##########
@@ -89,16 +89,21 @@
 @SuppressWarnings("SameParameterValue")
 public class HiveTestUtil {
 
+  public static final String DB_NAME = "testdb";
+  public static String TABLE_NAME = "test1";

Review comment:
       TABLE_NAME also final?

##########
File path: hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java
##########
@@ -112,16 +117,21 @@ public static void setUp() throws IOException, InterruptedException, HiveExcepti
     }
     fileSystem = FileSystem.get(configuration);
 
-    hiveSyncConfig = new HiveSyncConfig();
-    hiveSyncConfig.jdbcUrl = hiveTestService.getJdbcHive2Url();
-    hiveSyncConfig.hiveUser = "";
-    hiveSyncConfig.hivePass = "";
-    hiveSyncConfig.databaseName = "testdb";
-    hiveSyncConfig.tableName = "test1";
-    hiveSyncConfig.basePath = Files.createTempDirectory("hivesynctest" + Instant.now().toEpochMilli()).toUri().toString();
-    hiveSyncConfig.assumeDatePartitioning = true;
-    hiveSyncConfig.usePreApacheInputFormat = false;
-    hiveSyncConfig.partitionFields = Collections.singletonList("datestr");
+    basePath = Files.createTempDirectory("hivesynctest" + Instant.now().toEpochMilli()).toUri().toString();
+
+    hiveSyncProps = new TypedProperties();
+    hiveSyncProps.setProperty(HiveSyncConfig.HIVE_URL.key(), hiveTestService.getJdbcHive2Url());
+    hiveSyncProps.setProperty(HiveSyncConfig.HIVE_USER.key(), "");
+    hiveSyncProps.setProperty(HiveSyncConfig.HIVE_PASS.key(), "");
+    hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_DATABASE_NAME.key(), DB_NAME);
+    hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_TABLE_NAME.key(), TABLE_NAME);
+    hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_BASE_PATH, basePath);
+    hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_ASSUME_DATE_PARTITION.key(), "true");
+    hiveSyncProps.setProperty(HiveSyncConfig.HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false");
+    hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr");
+    hiveSyncProps.setProperty(HiveSyncConfig.HIVE_BATCH_SYNC_PARTITION_NUM.key(), "3");
+
+    hiveSyncConfig = new HiveSyncConfig(hiveSyncProps);

Review comment:
       i'd prefer to have builder pattern to reliably construct `HiveSyncConfig` instead of passing uncontrollable "raw" props.

##########
File path: hudi-sync/hudi-sync-common/src/test/java/org/apache/hudi/sync/common/util/TestSyncUtilHelpers.java
##########
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.sync.common.util;
+
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.sync.common.AbstractSyncTool;
+
+import java.io.IOException;
+import java.util.Properties;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class TestSyncUtilHelpers {
+  private static final String BASE_PATH = "/tmp/test";
+  private static final String BASE_FORMAT = "PARQUET";
+
+  private static Configuration configuration;
+  private static FileSystem fileSystem;

Review comment:
       ```suggestion
     private Configuration configuration;
     private FileSystem fileSystem;
   ```
   
   static vars could run into issues if enable parallel testing 

##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/BootstrapExecutor.java
##########
@@ -161,12 +160,16 @@ public void execute() throws IOException {
    */
   private void syncHive() {
     if (cfg.enableHiveSync || cfg.enableMetaSync) {
-      HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat);
-      HiveConf hiveConf = new HiveConf(fs.getConf(), HiveConf.class);
-      hiveConf.set(HiveConf.ConfVars.METASTOREURIS.varname,hiveSyncConfig.metastoreUris);
-      LOG.info("Hive Conf => " + hiveConf.getAllProperties().toString());
-      LOG.info("Hive Sync Conf => " + hiveSyncConfig);
-      new HiveSyncTool(hiveSyncConfig, new HiveConf(configuration, HiveConf.class), fs).syncHoodieTable();
+      TypedProperties metaProps = new TypedProperties();
+      metaProps.putAll(props);
+      metaProps.put(HoodieSyncConfig.META_SYNC_BASE_PATH, cfg.targetBasePath);
+      metaProps.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT, cfg.baseFileFormat);
+      if (props.getBoolean(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.key(), HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.defaultValue())) {
+        metaProps.put(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC_SPEC, HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()),
+            props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key())));
+      }

Review comment:
       with builder pattern we should be able to simplify this sort of props construction

##########
File path: hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncTool.java
##########
@@ -17,17 +17,31 @@
 
 package org.apache.hudi.sync.common;
 
+import org.apache.hudi.common.config.TypedProperties;
+
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 
 import java.util.Properties;
 
+/**
+ * Base class to sync Hudi meta data with Metastores to make
+ * Hudi table queryable through external systems.
+ */
 public abstract class AbstractSyncTool {
-  protected Properties props;
-  protected FileSystem fileSystem;
+  protected final Configuration conf;
+  protected final FileSystem fs;
+  protected TypedProperties props;
 
-  public AbstractSyncTool(Properties props, FileSystem fileSystem) {
+  public AbstractSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {

Review comment:
       I think a cleaner interface would be 
   
   ```suggestion
     public AbstractSyncTool(HoodieSyncConfig syncConfig, FileSystem fs) {
   ```
   
   `hadoopConf` should be set by `fs.getConf()`. And `props` is not controllable; hard for user to decide what to pass in here. 
   
   `HoodieSyncConfig` is the matching config class for `SyncTool` as the names suggest. Internally we can still do `props = syncConfig.getProps();`

##########
File path: hudi-sync/hudi-sync-common/src/test/java/org/apache/hudi/sync/common/util/TestSyncUtilHelpers.java
##########
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.sync.common.util;
+
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.sync.common.AbstractSyncTool;
+
+import java.io.IOException;
+import java.util.Properties;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class TestSyncUtilHelpers {
+  private static final String BASE_PATH = "/tmp/test";
+  private static final String BASE_FORMAT = "PARQUET";
+
+  private static Configuration configuration;
+  private static FileSystem fileSystem;
+
+  @BeforeEach
+  public static void setUp() throws IOException {

Review comment:
       
   ```suggestion
     @BeforeEach
     public void setUp() throws IOException {
   ```
   
   OR did you mean to use `@BeforeAll` ?
   

##########
File path: hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncTool.java
##########
@@ -205,7 +206,8 @@ public static void main(String[] args) {
       cmd.usage();
       System.exit(1);
     }
-    FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
-    new DLASyncTool(Utils.configToProperties(cfg), fs).syncHoodieTable();
+    Configuration hadoopConf = new Configuration();
+    FileSystem fs = FSUtils.getFs(cfg.basePath, hadoopConf);
+    new DLASyncTool(Utils.configToProperties(cfg), hadoopConf, fs).syncHoodieTable();

Review comment:
       `FSUtils.getFS()` alters hadoopConf internally and the caller has no idea about it. This looks like passing an empty hadoopConf to it, though the conf was updated. So suggest passing `fs.getConf()` to DLASyncTool to avoid the confusion.

##########
File path: hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestHiveSyncTool.java
##########
@@ -1115,31 +1036,40 @@ public void testTypeConverter(String syncMode) throws Exception {
   @ParameterizedTest
   @MethodSource("syncMode")
   public void testSyncWithoutDiffs(String syncMode) throws Exception {
-    hiveSyncConfig.syncMode = syncMode;
-    hiveSyncConfig.isConditionalSync = true;
-    HiveTestUtil.hiveSyncConfig.batchSyncNum = 2;
-    String tableName = HiveTestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;
+    String tableName = HiveTestUtil.TABLE_NAME + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;
+    hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
+    hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_CONDITIONAL_SYNC.key(), "true");
 
     String commitTime0 = "100";
     String commitTime1 = "101";
     String commitTime2 = "102";
     HiveTestUtil.createMORTable(commitTime0, commitTime1, 2, true, true);
 
-    HoodieHiveClient hiveClient =
-        new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
-
-    HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
-    tool.syncHoodieTable();
+    reinitHiveSyncClient();
+    reSyncHiveTable();
 
     assertTrue(hiveClient.doesTableExist(tableName));
     assertEquals(commitTime1, hiveClient.getLastCommitTimeSynced(tableName).get());
 
     HiveTestUtil.addMORPartitions(0, true, true, true, ZonedDateTime.now().plusDays(2), commitTime1, commitTime2);
 
-    tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
-    tool.syncHoodieTable();
-    hiveClient = new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
+    reSyncHiveTable();
     assertEquals(commitTime1, hiveClient.getLastCommitTimeSynced(tableName).get());
   }
 
+  private void reSyncHiveTable() {
+    hiveSyncTool.syncHoodieTable();
+    // we need renew the hiveclient after tool.syncHoodieTable(), because it will close hive
+    // session, then lead to connection retry, we can see there is a exception at log.
+    reinitHiveSyncClient();
+  }
+
+  private void reinitHiveSyncClient() {
+    hiveSyncTool = new HiveSyncTool(hiveSyncProps, HiveTestUtil.getHiveConf(), fileSystem);
+    hiveClient = hiveSyncTool.hoodieHiveClient;
+  }
+
+  private int getPartitionFieldSize() {
+    return hiveSyncProps.getString(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key()).split(",").length;
+  }

Review comment:
       optional but i'd prefer to have static helpers for good portability.

##########
File path: hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/SyncUtilHelpers.java
##########
@@ -0,0 +1,67 @@
+package org.apache.hudi.sync.common.util;
+
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.ReflectionUtils;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.sync.common.AbstractSyncTool;
+import org.apache.hudi.sync.common.HoodieSyncConfig;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+
+import java.util.Properties;
+
+/**
+ * Helper class for syncing Hudi commit data with external metastores.
+ */
+public class SyncUtilHelpers {
+
+  /**
+   * Create an instance of an implementation of {@link AbstractSyncTool} that will sync all the relevant meta information
+   * with an external metastore such as Hive etc. to ensure Hoodie tables can be queried or read via external systems.
+   *
+   * @param metaSyncClass  The class that implements the sync of the metadata.
+   * @param props          property map.
+   * @param hadoopConfig   Hadoop confs.
+   * @param fs             Filesystem used.
+   * @param targetBasePath The target base path that contains the hoodie table.
+   * @param baseFileFormat The file format used by the hoodie table (defaults to PARQUET).
+   * @return
+   */
+  public static void createAndSyncHoodieMeta(String metaSyncClass,
+                                             TypedProperties props,
+                                             Configuration hadoopConfig,
+                                             FileSystem fs,
+                                             String targetBasePath,
+                                             String baseFileFormat) {
+    try {
+      createMetaSyncClass(metaSyncClass, props, hadoopConfig, fs, targetBasePath, baseFileFormat).syncHoodieTable();
+    } catch (Throwable e) {
+      throw new HoodieException("Could not sync using the meta sync class " + metaSyncClass, e);
+    }
+  }
+
+  static AbstractSyncTool createMetaSyncClass(String metaSyncClass,

Review comment:
       IIUC "meta sync" and "hudi sync" or "hoodie sync" mean the same thing. Since we call the module `hudi-sync` then we should just stick to it and call `hoodieSync` throughout the codebase. no `metaSync` at all. WDYT?

##########
File path: hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncTool.java
##########
@@ -17,17 +17,31 @@
 
 package org.apache.hudi.sync.common;
 
+import org.apache.hudi.common.config.TypedProperties;
+
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 
 import java.util.Properties;
 
+/**
+ * Base class to sync Hudi meta data with Metastores to make
+ * Hudi table queryable through external systems.
+ */
 public abstract class AbstractSyncTool {
-  protected Properties props;
-  protected FileSystem fileSystem;
+  protected final Configuration conf;
+  protected final FileSystem fs;
+  protected TypedProperties props;
 
-  public AbstractSyncTool(Properties props, FileSystem fileSystem) {
+  public AbstractSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
     this.props = props;
-    this.fileSystem = fileSystem;
+    this.conf = conf;
+    this.fs = fs;
+  }
+
+  @Deprecated

Review comment:
       ```suggestion
     /**
      * @deprecated <some notes>
      */
     @Deprecated
   ```

##########
File path: hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java
##########
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.sync.common;
+
+import org.apache.hudi.common.config.ConfigProperty;
+import org.apache.hudi.common.config.HoodieConfig;
+import org.apache.hudi.common.config.HoodieMetadataConfig;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.table.HoodieTableConfig;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
+
+import com.beust.jcommander.Parameter;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Function;
+
+/**
+ * Configs needed to sync data into external meta stores, catalogs, etc.
+ */
+public class HoodieSyncConfig extends HoodieConfig {

Review comment:
       like other config classes, would be helpful to have builder support and `getProps()` API.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org