Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2021/02/02 09:08:46 UTC

[GitHub] [hudi] teeyog commented on a change in pull request #2431: [HUDI-1526] Translate the api partitionBy to hoodie.datasource.write.partitionpath.field

teeyog commented on a change in pull request #2431:
URL: https://github.com/apache/hudi/pull/2431#discussion_r568434336



##########
File path: hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala
##########
@@ -348,4 +351,65 @@ class TestCOWDataSource extends HoodieClientTestBase {
 
     assertTrue(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, "000"))
   }
+
+  @Test def testPartitionByTranslateToPartitionPath() {
+    val records = recordsToStrings(dataGen.generateInserts("000", 100)).toList
+    val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2))
+
+    val noPartitionPathOpts = commonOpts - DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY
+
+    // partitionBy takes effect
+    inputDF.write.format("hudi")
+      .partitionBy("current_date")
+      .options(noPartitionPathOpts)
+      .mode(SaveMode.Overwrite)
+      .save(basePath)
+
+    var recordsReadDF = spark.read.format("org.apache.hudi")
+      .load(basePath + "/*/*")
+
+    assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= col("current_date").cast("string")).count() == 0)
+
+    // PARTITIONPATH_FIELD_OPT_KEY takes effect
+    inputDF.write.format("hudi")
+      .partitionBy("current_date")
+      .options(noPartitionPathOpts)
+      .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "timestamp")
+      .mode(SaveMode.Overwrite)
+      .save(basePath)
+
+    recordsReadDF = spark.read.format("org.apache.hudi")
+      .load(basePath + "/*/*")
+
+    assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= col("timestamp").cast("string")).count() == 0)
+
+    // CustomKeyGenerator with SIMPLE
+    inputDF.write.format("hudi")
+      .partitionBy("current_ts")
+      .options(noPartitionPathOpts)
+      .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY, "org.apache.hudi.keygen.CustomKeyGenerator")
+      .mode(SaveMode.Overwrite)
+      .save(basePath)
+
+    recordsReadDF = spark.read.format("org.apache.hudi")
+      .load(basePath + "/*/*")
+
+    assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= col("current_ts").cast("string")).count() == 0)
+
+    // CustomKeyGenerator with TIMESTAMP
+    inputDF.write.format("hudi")
+      .partitionBy("current_ts")
+      .options(noPartitionPathOpts)
+      .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY, "org.apache.hudi.keygen.CustomKeyGenerator")
+      .option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS")
+      .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd")
+      .mode(SaveMode.Overwrite)
+      .save(basePath)
+
+    recordsReadDF = spark.read.format("org.apache.hudi")
+      .load(basePath + "/*/*")
+
+    val udf_date_format = udf((data: Long) => new DateTime(data).toString(DateTimeFormat.forPattern("yyyyMMdd")))

Review comment:
       @nsivabalan 
   Thank you for the review; the code has been adjusted according to your suggestions.
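
   For readers skimming the thread, a minimal usage sketch of the behavior the test above exercises (not part of the PR diff; the table name, record key, and precombine field below are illustrative assumptions, only the partitionBy translation is what the PR covers):

       // Minimal sketch, assuming the behavior under test: columns passed to
       // DataFrameWriter.partitionBy(...) are translated into
       // hoodie.datasource.write.partitionpath.field when that option is not set.
       // Table name, record key, and precombine field are illustrative only.
       import org.apache.spark.sql.SaveMode

       inputDF.write.format("hudi")
         .option("hoodie.table.name", "example_table")
         .option("hoodie.datasource.write.recordkey.field", "uuid")
         .option("hoodie.datasource.write.precombine.field", "ts")
         .partitionBy("current_date")   // expected to act like setting
                                        // hoodie.datasource.write.partitionpath.field = current_date
         .mode(SaveMode.Overwrite)
         .save(basePath)

   If hoodie.datasource.write.partitionpath.field is also supplied explicitly, it would take precedence over partitionBy, which is what the second block of the test asserts.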




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org