You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by "bvaradar (via GitHub)" <gi...@apache.org> on 2023/03/12 00:29:39 UTC

[GitHub] [hudi] bvaradar commented on a diff in pull request #7834: [HUDI-5690] Add simpleBucketPartitioner to support using the simple bucket index under bulkinsert

bvaradar commented on code in PR #7834:
URL: https://github.com/apache/hudi/pull/7834#discussion_r1133165678


##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/storage/HoodieSimpleBucketLayout.java:
##########
@@ -34,6 +34,7 @@ public class HoodieSimpleBucketLayout extends HoodieStorageLayout {
   public static final Set<WriteOperationType> SUPPORTED_OPERATIONS = CollectionUtils.createImmutableSet(
       WriteOperationType.INSERT,
       WriteOperationType.INSERT_PREPPED,
+      WriteOperationType.BULK_INSERT,

Review Comment:
   @wuwenchi @YuweiXiao: should HoodieBucketIndex.requiresTagging also return true for BULK_INSERT?



##########
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDBucketIndexPartitioner.java:
##########
@@ -18,15 +18,155 @@
 
 package org.apache.hudi.execution.bulkinsert;
 
+import org.apache.avro.Schema;
+import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.common.config.SerializableSchema;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.common.util.collection.FlatLists;
+import org.apache.hudi.io.AppendHandleFactory;
+import org.apache.hudi.io.SingleFileHandleCreateFactory;
+import org.apache.hudi.io.WriteHandleFactory;
 import org.apache.hudi.table.BulkInsertPartitioner;
 
+import org.apache.hudi.table.HoodieTable;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.spark.Partitioner;
 import org.apache.spark.api.java.JavaRDD;
+import scala.Tuple2;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+
 
 /**
  * Abstract of bucket index bulk_insert partitioner
- * TODO implement partitioner for SIMPLE BUCKET INDEX
  */
 public abstract class RDDBucketIndexPartitioner<T>
     implements BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>> {
+

Review Comment:
   HoodieBucketIndex is defined in an engine-agnostic way (it uses HoodieData and HoodieEngineContext). Can we also define the base partitioner class using these abstractions instead of using JavaRDD directly?



##########
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDBucketIndexPartitioner.java:
##########
@@ -18,15 +18,155 @@
 
 package org.apache.hudi.execution.bulkinsert;
 
+import org.apache.avro.Schema;
+import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.common.config.SerializableSchema;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.common.util.collection.FlatLists;
+import org.apache.hudi.io.AppendHandleFactory;
+import org.apache.hudi.io.SingleFileHandleCreateFactory;
+import org.apache.hudi.io.WriteHandleFactory;
 import org.apache.hudi.table.BulkInsertPartitioner;
 
+import org.apache.hudi.table.HoodieTable;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.spark.Partitioner;
 import org.apache.spark.api.java.JavaRDD;
+import scala.Tuple2;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+
 
 /**
  * Abstract of bucket index bulk_insert partitioner
- * TODO implement partitioner for SIMPLE BUCKET INDEX
  */
 public abstract class RDDBucketIndexPartitioner<T>
     implements BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>> {
+

Review Comment:
   HoodieBucketIndex is defined in an engine-agnostic way (it uses HoodieData and HoodieEngineContext). Can we also define the base partitioner class using these abstractions instead of using JavaRDD directly?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org