Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/07/11 09:38:52 UTC

[GitHub] [hudi] jiezi2026 commented on issue #6070: [SUPPORT]'hoodie.datasource.write.hive_style_partitioning':'true' does not take effect in hudi-0.11.1 & spark 3.2.1

jiezi2026 commented on issue #6070:
URL: https://github.com/apache/hudi/issues/6070#issuecomment-1180180328

   To reproduce my problem, I made a small test case.
   In our scenario, we first use Sqoop to extract MySQL data into a Hive table, TEMP.temp_hudi_s1. Then we use PySpark to load the data from TEMP.temp_hudi_s1 into TEMP.temp_hudi_t1 to complete the data initialization.
   
   -----------------------------------[sparksql]----------------------------
   CREATE database if NOT EXISTS TEMP COMMENT 'temp' location 'hdfs://csbigdata/DATACENTER/TEMP/';
   
   
   DROP TABLE if exists TEMP.temp_hudi_s1;
   CREATE TABLE if not exists  TEMP.temp_hudi_s1 (
   id bigint comment 'address ID',
   ship_no string comment 'shipping order number',
   created_date string comment 'creation time');
   
   
   
   insert overwrite table TEMP.temp_hudi_s1 
         select 1  id,'FDP2203170007332' ship_no,'2022-03-17 01:44:15.0' created_date
   union select 2  id,'FDP2203170009040' ship_no,'2022-03-17 01:44:52.0' created_date
   union select 3  id,'FDP2203230005068' ship_no,'2022-03-23 21:35:11.0' created_date
   union select 4  id,'FDP2203250001605' ship_no,'2022-03-25 19:45:50.0' created_date
   union select 5  id,'FDP2203250009052' ship_no,'2022-03-25 19:46:11.0' created_date
   union select 6  id,'FDP2203280007475' ship_no,'2022-03-28 10:23:05.0' created_date
   union select 7  id,'FDP2203280003714' ship_no,'2022-03-28 16:46:52.0' created_date
   union select 8  id,'FDP2203280004322' ship_no,'2022-03-28 16:47:52.0' created_date
   union select 9  id,'FDP2203290007834' ship_no,'2022-03-29 09:40:13.0' created_date
   union select 10 id,'FDP2203290005863' ship_no,'2022-03-29 11:03:48.0' created_date;
   
   
   drop TABLE if exists TEMP.temp_hudi_t1;
   CREATE TABLE if not exists  TEMP.temp_hudi_t1 (
   id bigint comment 'address ID',
   ship_no string comment 'shipping order number',
   created_date string comment 'creation time',
   bi___precombine___ts bigint COMMENT 'precombine key',
   create_date string comment 'creation date' )
   using hudi
   tblproperties (type = 'mor', primaryKey = 'id', preCombineField = 'bi___precombine___ts')
   options (
       "hoodie.table.keygenerator.class"="org.apache.hudi.keygen.ComplexKeyGenerator"
   )
   COMMENT 'shipping address table'
   PARTITIONED BY( create_date )
   LOCATION 'hdfs://csbigdata/DATACENTER/TEMP/temp_hudi_t1';
   -----------------------------------[sparksql]----------------------------
   
   Next, start a pyspark client:
   
   /opt/apache/SPARK/SPARK-CURRENT/bin/pyspark \
   --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=/usr/bin/python3 \
   --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
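
   (Assumption about the environment: a fresh session would also need the Hudi Spark bundle on the classpath, e.g. org.apache.hudi:hudi-spark3.2-bundle_2.12:0.11.1 via --packages or --jars; the command above presumably relies on the bundle already being installed on the cluster.)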
   
   ---------------------------[pyspark]----------------------------
   tableName = "temp_hudi_t1"
   basePath = "hdfs://csbigdata/DATACENTER/TEMP/temp_hudi_t1"
   # The case of the primary key field must match the case of the schema returned by Hive
   keys="id" 
   target_hive_db="TEMP"
   target_hive_table="temp_hudi_t1"
   # The case of the partition field must be consistent with the primary key
   partitionId="create_date"
   precombineField="bi___precombine___ts"
   
   source_sql="select *  \
   , nvl(unix_timestamp(cast( created_date as string),'yyyy-MM-dd HH:mm:ss.S'),1 ) * 1000 as bi___precombine___ts  \
   , cast(to_date( created_date ) as string) create_date  \
   from  TEMP.temp_hudi_s1 "
   
   hive_source_data = spark.sql(source_sql)
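
   # Optional sanity check (not required for the repro): preview the derived
   # precombine and partition columns the writer will receive
   hive_source_data.select("id", "create_date", "bi___precombine___ts").show(3, False)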
   
   
   hudi_options = {
       'hoodie.table.name': tableName,
       'hoodie.datasource.write.recordkey.field': keys,
       'hoodie.datasource.write.keygenerator.class':'org.apache.hudi.keygen.ComplexKeyGenerator',
       'hoodie.datasource.write.partitionpath.field': partitionId,
       'hoodie.parquet.writelegacyformat.enabled': 'true',
       'hoodie.datasource.write.table.name': tableName,
       'hoodie.table.timeline.timezone': 'LOCAL',
       'hoodie.datasource.write.operation': 'BULK_INSERT',
       'hoodie.bulkinsert.sort.mode': 'PARTITION_SORT',
       'hoodie.bulkinsert.user.defined.partitioner.sort.columns':partitionId,
       'hoodie.datasource.write.precombine.field': precombineField,
       'hoodie.bulkinsert.shuffle.parallelism': 200,
       'hoodie.upsert.shuffle.parallelism': 200,
       'hoodie.insert.shuffle.parallelism': 200,
       'hoodie.index.type': 'BLOOM',
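        # hive_style_partitioning=true should make the partition directories be
        # written as create_date=<value> instead of just <value>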
       'hoodie.datasource.write.hive_style_partitioning':'true',
       'hoodie.clean.automatic':'true',
       'hoodie.cleaner.policy':'KEEP_LATEST_COMMITS',
       'hoodie.cleaner.commits.retained':'10',
       'hoodie.archive.merge.enable':'true',
       'hoodie.archive.automatic':'true',
       'hoodie.archive.merge.files.batch.size':'10',
       'archive.min_commits':'20',
       'archive.max_commits':'30',
       'hoodie.keep.min.commits':'20',
       'hoodie.keep.max.commits':'30'
   }
   
   HUDI_SOURCE_COUNT=hive_source_data.count()
   print("HUDI_SOURCE_COUNT:"+str(HUDI_SOURCE_COUNT))
   hive_source_data.write.format("hudi")  \
       .options(**hudi_options)           \
       .mode("append")                 \
       .save(basePath)
   
   ---------------------------[pyspark]----------------------------
   
   hdfs dfs -ls hdfs://csbigdata/DATACENTER/TEMP/temp_hudi_t1
   ![image](https://user-images.githubusercontent.com/98273236/178235643-19271b29-c755-41b4-970f-7f3591b8ff5f.png)
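
   For reference, the partition layout can also be checked from the same PySpark session via Hudi's _hoodie_partition_path metadata column; with hive-style partitioning the values should look like create_date=2022-03-17 rather than 2022-03-17. A minimal check:

   # Read the table back and list the distinct partition paths that were written
   check_df = spark.read.format("hudi").load(basePath)
   check_df.select("_hoodie_partition_path").distinct().show(truncate=False)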
   
   
   ---------------------------------------------------------
   By contrast, testing with the following Spark SQL behaves as expected.
   
   set hoodie.sql.bulk.insert.enable=true;
   set hoodie.sql.insert.mode=non-strict;
   INSERT into TEMP.temp_hudi_t1 
   select id,ship_no,created_date,
   nvl(unix_timestamp(cast( created_date as string),'yyyy-MM-dd HH:mm:ss.S'),1 ) * 1000 as bi___precombine___ts
   , cast(to_date( created_date ) as string) create_date
   from TEMP.temp_hudi_s1
   
   ![image](https://user-images.githubusercontent.com/98273236/178235744-bd4c4c68-97ce-4d53-bbe6-6991e7ef00d8.png)
   
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org