You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2020/11/23 20:07:01 UTC

[GitHub] [hudi] vinothchandar commented on a change in pull request #2225: [HUDI-1363] Provide Option to drop partition columns after they are used to generate partition or record keys

vinothchandar commented on a change in pull request #2225:
URL: https://github.com/apache/hudi/pull/2225#discussion_r528958557



##########
File path: hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
##########
@@ -117,6 +134,8 @@ private[hudi] object HoodieSparkSqlWriter {
       }
 
       val commitActionType = DataSourceUtils.getCommitActionType(operation, tableConfig.getTableType)
+      val partition_key = parameters.get(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY).get

Review comment:
       camelCase? 

##########
File path: hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
##########
@@ -55,6 +55,23 @@ private[hudi] object HoodieSparkSqlWriter {
   private var tableExists: Boolean = false
   private var asyncCompactionTriggerFnDefined: Boolean = false
 
+  def generateSchemaWithoutPartitionColumns(partitionParam: String, oldSchema: Schema): Schema = {
+    val fieldsToRemove =  new util.ArrayList[String]()
+    partitionParam.split(",").map(partitionField => partitionField.trim)
+      .filter(s => !s.isEmpty).map(field => fieldsToRemove.add(field))
+    HoodieAvroUtils.removeFields(oldSchema, fieldsToRemove)
+  }
+
+  def generateNewRecordForPartitionColumnsDrop(enableDropPartitionColumns: Boolean,

Review comment:
       Could we add the boolean as the third arg? That would make both method signatures easier to read.

##########
File path: hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
##########
@@ -55,6 +55,23 @@ private[hudi] object HoodieSparkSqlWriter {
   private var tableExists: Boolean = false
   private var asyncCompactionTriggerFnDefined: Boolean = false
 
+  def generateSchemaWithoutPartitionColumns(partitionParam: String, oldSchema: Schema): Schema = {
+    val fieldsToRemove =  new util.ArrayList[String]()
+    partitionParam.split(",").map(partitionField => partitionField.trim)
+      .filter(s => !s.isEmpty).map(field => fieldsToRemove.add(field))
+    HoodieAvroUtils.removeFields(oldSchema, fieldsToRemove)
+  }
+
+  def generateNewRecordForPartitionColumnsDrop(enableDropPartitionColumns: Boolean,
+                                               oldRecord: GenericRecord, partitionParam: String): GenericRecord = {
+    var record = oldRecord
+    if (enableDropPartitionColumns) {
+      val newSchema = generateSchemaWithoutPartitionColumns(partitionParam, oldRecord.getSchema)

Review comment:
       Doing this for every record is going to be expensive.

##########
File path: hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
##########
@@ -117,6 +134,8 @@ private[hudi] object HoodieSparkSqlWriter {
       }
 
       val commitActionType = DataSourceUtils.getCommitActionType(operation, tableConfig.getTableType)
+      val partition_key = parameters.get(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY).get

Review comment:
       Does this just cover the case when a key generator is not used?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org