Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/01/17 06:19:14 UTC

[GitHub] [hudi] xiarixiaoyao edited a comment on issue #4609: [SUPPORT] Got exception while using clustering with z-order

xiarixiaoyao edited a comment on issue #4609:
URL: https://github.com/apache/hudi/issues/4609#issuecomment-1014176723


   @ravs11 Sorry, but I cannot reproduce the exception.
   env: Spark 3.1.1, Hadoop 3.1.1, Parquet 1.12.2
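   For completeness, this is roughly how my session is set up (a sketch; the bundle jar
   name is illustrative, use whatever matches your Hudi build):
       // spark-shell --jars hudi-spark3-bundle_2.12-<version>.jar predefines `spark`;
       // in a standalone app, build the session manually:
       import org.apache.spark.sql.SparkSession

       val spark = SparkSession.builder()
         .appName("hudi-z-order-repro")
         // Hudi requires Kryo serialization for its Spark writers
         .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
         .getOrCreate()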
   Could you please help me check my code:
       // imports needed to run this snippet standalone
       import org.apache.hadoop.fs.Path
       import org.apache.spark.sql.SaveMode
       import org.apache.spark.sql.functions.lit

       val path1 = new Path("/tmp/default/clustering/")
    
       // create a partitioned parquet source table
       spark.sql(
         s"""
            |create table table1 (
            |product_id INT,
            |product_name STRING,
            |product_category STRING,
            |create_time BIGINT,
            |utc_date STRING
            |) using parquet
            | location '${new Path(path1, "p1").toString}'
            | partitioned by (utc_date)
                """.stripMargin)
       spark.sql(
         s"""
            | insert into table1 values
            | (123, 'laptop1' , 'electronics1', 1671881778000, '2021-12-24'),
            | (124, 'laptop2' , 'electronics2', 1671881778000, '2021-12-25'),
            | (125, 'laptop3' , 'electronics3', 1671881778000, '2021-12-24'),
            | (126, 'laptop4' , 'electronics4', 1671881778000, '2021-12-25'),
            | (127, 'laptop5' , 'electronics5', 1671881778000, '2021-12-24')
            |""".stripMargin)
   
       spark.sql(
         s"""
            | insert into table1 values
            | (451, 'tshirt1' , 'mens wear1', 1671968178000, '2021-12-24'),
            | (452, 'tshirt2' , 'mens wear2', 1671968178000, '2021-12-25'),
            | (453, 'tshirt3' , 'mens wear3', 1671968178000, '2021-12-24'),
            | (454, 'tshirt4' , 'mens wear4', 1671968178000, '2021-12-25'),
            | (455, 'tshirt5' , 'mens wear5', 1671968178000, '2021-12-24')
            |""".stripMargin)
   
       spark.sql(
         s"""
            | insert into table1 values
            | (551, 'ts1' , 'wear1', 1671968178000, '2021-12-24'),
            | (552, 'ts2' , 'wear2', 1671968178000, '2021-12-25'),
            | (553, 'ts3' , 'wear3', 1671968178000, '2021-12-24'),
            | (554, 'ts4' , 'wear4', 1671968178000, '2021-12-25'),
            | (555, 'ts5' , 'wear5', 1671968178000, '2021-12-24')
            |""".stripMargin)
       // bulk_insert with z-order clustering
       Seq("2021-12-24", "2021-12-25").foreach { utcDate =>
         val dfx = spark.read.parquet(s"/tmp/default/clustering/p1/utc_date=$utcDate")
           .withColumn("utc_date", lit(utcDate))
         val savePath = "/tmp/default/clustering/hudi_z_order"
         dfx.write.format("org.apache.hudi")
           .option("hoodie.table.name", s"hudi_z_order_test")
           .option("hoodie.datasource.write.table.name", s"hudi_z_order_test")
           .option("hoodie.datasource.write.operation", "bulk_insert")
           .option("hoodie.sql.insert.mode", "non-strict")
           .option("hoodie.datasource.write.precombine.field", "create_time")
           .option("hoodie.datasource.write.recordkey.field", "product_id")
           .option("hoodie.datasource.write.partitionpath.field", "utc_date")
           .option("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.SimpleKeyGenerator")
           .option("hoodie.datasource.write.hive_style_partitioning", "true")
           .option("hoodie.bulkinsert.shuffle.parallelism", "3")
           .option("hoodie.bulkinsert.sort.mode", "NONE")
           .option("hoodie.embed.timeline.server", "false")
           .option("hoodie.parquet.compression.codec", "zstd")
           .option("hoodie.clustering.inline", "true")
           .option("hoodie.clustering.inline.max.commits", "1")
           .option("hoodie.clustering.plan.strategy.target.file.max.bytes", "1073741824")
           .option("hoodie.clustering.plan.strategy.small.file.limit", "536870912")
           .option("hoodie.clustering.plan.strategy.sort.columns", "product_name,product_category")
           .option("hoodie.clustering.plan.strategy.max.bytes.per.group", Long.MaxValue.toString)
           .option("hoodie.layout.optimize.enable", "true")
           .option("hoodie.layout.optimize.strategy", "z-order")
           .mode(SaveMode.Append).save(savePath)
       }
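
   After the loop finishes I read the table back to make sure clustering succeeded; a
   minimal check (assuming load by base path works on your Hudi version):
       // all 15 source rows should survive bulk_insert + inline clustering
       val out = spark.read.format("org.apache.hudi")
         .load("/tmp/default/clustering/hudi_z_order")
       println(out.count()) // expect 15
       out.select("product_id", "product_name", "utc_date").show(false)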


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org