You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/03/23 08:09:15 UTC

[GitHub] [hudi] CrazyBeeline opened a new issue #5105: cluster with incorrect partition

CrazyBeeline opened a new issue #5105:
URL: https://github.com/apache/hudi/issues/5105


   Steps to reproduce the behavior:
   
   1. Ingest source data from Kafka with HoodieDeltaStreamer
   ![image](https://user-images.githubusercontent.com/25030234/159650915-aa4b4676-7fdc-47c5-b1ce-e5a3a511b2cc.png)
   Main configuration in insert_cluster.properties:
   
   hoodie.upsert.shuffle.parallelism=100
   hoodie.insert.shuffle.parallelism=100
   hoodie.bulkinsert.shuffle.parallelism=100
   hoodie.delete.shuffle.parallelism=100
   hoodie.rollback.parallelism=100
   hoodie.cleaner.parallelism=100
   
   hoodie.datasource.write.recordkey.field=insert_time,id
   hoodie.datasource.write.partitionpath.field=create_time:TIMESTAMP
   hoodie.datasource.write.precombine.field=insert_time
   
   hoodie.table.base.file.format=PARQUET
   
   hoodie.datasource.write.hive_style_partitioning=true
   hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.CustomKeyGenerator
   
   hoodie.deltastreamer.keygen.timebased.timestamp.type=DATE_STRING
   hoodie.deltastreamer.keygen.timebased.input.dateformat=yyyy/MM/dd
   hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy-MM-dd
   
   
   ##### memory
   #  on heap
   hoodie.memory.merge.fraction=0.6
   hoodie.memory.merge.max.size=1073741824
   
   hoodie.memory.compaction.fraction=0.6
   #hoodie.memory.compaction.max.size=
   
   ###### storage ######
   hoodie.logfile.data.block.max.size=268435456
   hoodie.logfile.max.size=1073741824
   hoodie.parquet.max.file.size=125829120
   hoodie.parquet.small.file.limit=104857600
   #for mor
   hoodie.logfile.to.parquet.compression.ratio=0.35
   
   
   ## kafka source
   hoodie.deltastreamer.source.kafka.topic=hive-kafka-hudi
   bootstrap.servers=hadoop02:9092,hadoop01:9092,hadoop03:9092
   auto.offset.reset=earliest
   
   ##### hudi table ####
   hoodie.database.name=default
   ##### hive sink ########
   hoodie.datasource.hive_sync.database=default
   hoodie.datasource.hive_sync.table=hudi_person_insert_cluster
   hoodie.datasource.hive_sync.username=root
   hoodie.datasource.hive_sync.password=
   hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hadoop03:10000
   hoodie.datasource.hive_sync.partition_fields=create_time
   hoodie.datasource.hive_sync.use_jdbc=false
   hoodie.datasource.hive_sync.support_timestamp=false
   hoodie.datasource.hive_sync.create_managed_table=false
   hoodie.datasource.hive_sync.sync_as_datasource=true
   hoodie.datasource.hive_sync.batch_num=10000
   hoodie.datasource.hive_sync.assume_date_partitioning=false
   hoodie.datasource.hive_sync.bucket_sync=false
   hoodie.datasource.hive_sync.auto_create_database=true
   hoodie.datasource.hive_sync.enable=true
   hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.HiveStylePartitionValueExtractor
   #hoodie.datasource.hive_sync.skip_ro_suffix=true
   #hoodie.datasource.hive_sync.create_managed_table=false
   hoodie.embed.timeline.server=true
   hoodie.deltastreamer.schemaprovider.source.schema.file=file:///opt/software/hudi/schame
   ######### compaction #######
   hoodie.compact.inline=false
   hoodie.compact.inline.max.delta.commits=10
   
   
   ###### clean #####
   hoodie.clean.automatic=true
   ## KEEP_LATEST_FILE_VERSIONS or KEEP_LATEST_COMMITS
   hoodie.cleaner.policy=KEEP_LATEST_COMMITS
   hoodie.cleaner.delete.bootstrap.base.file=true
   hoodie.cleaner.commits.retained=3
   # lazily  for multi-writers
   hoodie.cleaner.policy.failed.writes=EAGER
   
   # for KEEP_LATEST_FILE_VERSIONS
   hoodie.cleaner.fileversions.retained=3
   
   
   ##### clustering #######
   hoodie.clustering.inline=true
   hoodie.clustering.inline.max.commits=10
   
   
   hoodie.clustering.async.enabled=false
   hoodie.clustering.async.max.commits=3
   
   hoodie.clustering.preserve.commit.metadata=true
   
   
   hoodie.clustering.plan.strategy.target.file.max.bytes=133169152
   hoodie.clustering.plan.strategy.small.file.limit=52428800
   hoodie.clustering.plan.strategy.sort.columns=insert_time
   
   hoodie.clustering.plan.strategy.class=org.apache.hudi.client.clustering.plan.strategy.SparkRecentDaysClusteringPlanStrategy
   hoodie.clustering.plan.strategy.daybased.lookback.partitions=1
   hoodie.clustering.plan.strategy.daybased.skipfromlatest.partitions=2
   
   hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy
   
   hoodie.clustering.updates.strategy=org.apache.hudi.client.clustering.update.strategy.SparkRejectUpdateStrategy
   
   #### Multi Writer ######
   #single_writer
   hoodie.write.concurrency.mode=optimistic_concurrency_control
   #EAGER
   hoodie.cleaner.policy.failed.writes=LAZY
   hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider
   hoodie.write.lock.zookeeper.url=hadoop01,hadoop02,hadoop03
   hoodie.write.lock.zookeeper.port=2181
   hoodie.write.lock.zookeeper.lock_key=hive_kafka_hudi
   hoodie.write.lock.zookeeper.base_path=/hudi_lock
   hoodie.write.lock.zookeeper.connection_timeout_ms=15000
   hoodie.write.lock.zookeeper.session_timeout_ms=60000
   #### commit callback####
   hoodie.write.commit.callback.on=true
   hoodie.write.commit.callback.class=org.apache.hudi.utilities.callback.kafka.HoodieWriteCommitKafkaCallback
   hoodie.write.commit.callback.kafka.bootstrap.servers=hadoop02:9092,hadoop01:9092,hadoop03:9092
   hoodie.write.commit.callback.kafka.topic=hudi_commit_callback
   #
   #hoodie.write.commit.callback.kafka.partition=
   hoodie.write.commit.callback.kafka.acks=all
   hoodie.write.commit.callback.kafka.retries=3
   
   #### metadata ####
   hoodie.metadata.clean.async=false
   hoodie.metadata.cleaner.commits.retained=3
   hoodie.metadata.compact.max.delta.commits=10
   hoodie.metadata.keep.max.commits=30
   hoodie.metadata.keep.min.commits=20
   hoodie.commits.archival.batch=10
   ##### archive
   hoodie.archive.automatic=true
   hoodie.archivelog.folder=archived
   hoodie.archive.delete.parallelism=10
   
   2. I have three partitions:
   ![image](https://user-images.githubusercontent.com/25030234/159651430-931ec360-cf14-4f9c-a98a-2f46ae22d401.png)
   3. Based on the above configuration, I expect only the partition create_time=2021-03-08 to be clustered.
   But in fact all partitions are clustered.
   4. I ran a test.
   With hoodie.clustering.inline=false:
   partition create_time=2021-03-08 (the other partitions look similar)
   ![image](https://user-images.githubusercontent.com/25030234/159652181-162b3bed-d11b-40bd-aab5-7c809c89714d.png)
   With hoodie.clustering.inline=true:
   partition create_time=2021-03-08 has a clustering operation
   ![image](https://user-images.githubusercontent.com/25030234/159652259-4228998f-89cf-4303-8c62-e46deeb426a9.png)
   partition create_time=2021-03-09 also has a clustering operation
   ![image](https://user-images.githubusercontent.com/25030234/159652352-bbb84ded-5865-43e0-9164-ace9af058c49.png)
   partition create_time=2021-03-10 also has a clustering operation
   ![image](https://user-images.githubusercontent.com/25030234/159652431-ce9410fb-df18-4476-bc61-aa8727dbcbaf.png)
   
   
   
   **Environment Description**
   
   * Hudi version :0.10.1
   
   * Spark version :3.1.3
   
   * Hive version :3.1.2
   
   * Hadoop version :3.2.2
   
   * Storage (HDFS/S3/GCS..) :HDFS
   
   * Running on Docker? (yes/no) :no
   
   
   **Additional context**
   
   Add any other context about the problem here.
   
   **Stacktrace**
   
   ```Add the stacktrace of the error.```
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [hudi] CrazyBeeline commented on issue #5105: cluster with incorrect partition

Posted by GitBox <gi...@apache.org>.
CrazyBeeline commented on issue #5105:
URL: https://github.com/apache/hudi/issues/5105#issuecomment-1076063686


   By the way, using this configuration causes the HoodieDeltaStreamer task to stop
   --------
   ![image](https://user-images.githubusercontent.com/25030234/159653242-06241134-fc8f-4680-bfab-6eab0b24946c.png)
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [hudi] nsivabalan commented on issue #5105: cluster with incorrect partition

Posted by GitBox <gi...@apache.org>.
nsivabalan commented on issue #5105:
URL: https://github.com/apache/hudi/issues/5105#issuecomment-1079817651


   @suryaprasanna : Can you please assist here?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org