You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@hudi.apache.org by "soumilshah1995 (via GitHub)" <gi...@apache.org> on 2023/03/16 13:14:13 UTC

[GitHub] [hudi] soumilshah1995 opened a new issue, #8207: [SUPPORT] Hudi 0.13 Consistent Hashing Issue for MOR Tables

soumilshah1995 opened a new issue, #8207:
URL: https://github.com/apache/hudi/issues/8207

   
   i am trying to implement consistent hashing and example code given on website on release notes 
   
   ```
   """"
   :Consistent Hashing
   Hudi supports Upsert operation to de-duplicate records in a table, which depends on indexing schemes to perform record location lookup. Among many index options, bucket index (in progress, RFC-29) achieves promising Upsert performance, around ~3x improvement on throughput compared to using Bloom Filter. However, it requires pre-configure a fixed bucket number and cannot be changed afterwards. Combined with the design of one-one mapping between hash buckets and file groups, hudi tables with bucket index have some practical issues, such as data skew and unlimited file group size, which now can only be resolved by resetting a suitable bucket number through re-writing the whole table.
   Problems can be solved by introducing Consistent Hashing Index. It achieves bucket resizing by splitting or merging several local buckets (i.e., only large file groups) while leaving most buckets untouched. This feature allows us to adjust bucket number dynamically in a background service with minimal impacts on downstream systems relying on Hudi. For example, concurrent readers and writers are not blocked during the resizing.
   
   """
   
   try:
   
       import os
       import sys
       import uuid
   
       import pyspark
       from pyspark.sql import SparkSession
       from pyspark import SparkConf, SparkContext
       from pyspark.sql.functions import col, asc, desc
       from pyspark.sql.functions import col, to_timestamp, monotonically_increasing_id, to_date, when
       from pyspark.sql.functions import *
       from pyspark.sql.types import *
       from datetime import datetime
       from functools import reduce
       from faker import Faker
       import datetime
   
   except Exception as e:
       pass
   
   
   SUBMIT_ARGS = "--packages org.apache.hudi:hudi-spark3.3-bundle_2.12:0.13.0 pyspark-shell"
   os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS
   os.environ['PYSPARK_PYTHON'] = sys.executable
   os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
   
   
   
   spark = SparkSession.builder \
       .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer') \
       .config('className', 'org.apache.hudi') \
       .config('spark.sql.hive.convertMetastoreParquet', 'false') \
       .getOrCreate()
   
   
   
   import faker
   import uuid
   
   global faker
   faker = Faker()
   
   class DataGenerator(object):
   
       @staticmethod
       def get_data(samples):
           return [
               (
                   uuid.uuid4().__str__(),
                   faker.name(),
                   faker.random_element(elements=('IT', 'HR', 'Sales', 'Marketing')),
                   faker.random_element(elements=('CA', 'NY', 'TX', 'FL', 'IL', 'RJ')),
                   str(faker.random_int(min=10000, max=150000)),
                   str(faker.random_int(min=18, max=60)),
                   str(faker.random_int(min=0, max=100000)),
                   str(faker.unix_time()),
                   faker.email(),
                   faker.credit_card_number(card_type='amex'),
                   faker.year(),
                   faker.month()
   
               ) for x in range(samples)
           ]
   
   
   db_name = "hudidb"
   table_name = "hudi_bucket_consistent_hasing_index"
   
   recordkey = 'emp_id,state'
   path = f"file:///C:/tmp/{db_name}/{table_name}"
   precombine = "ts"
   method = 'upsert'
   table_type = "MERGE_ON_READ"
   BUCKET_INDEX_HASH_FEILD = 'state'
   PARTITION_FIELD = 'year'
   
   hudi_options = {
       'hoodie.table.name': table_name,
       'hoodie.datasource.write.table.type': table_type,
       'hoodie.datasource.write.recordkey.field': recordkey,
       'hoodie.datasource.write.table.name': table_name,
       'hoodie.datasource.write.operation': method,
       'hoodie.datasource.write.precombine.field': precombine
   
   
       ,"hoodie.index.type":"BUCKET"
       ,"hoodie.index.bucket.engine" : 'CONSISTENT_HASHING'
       ,'hoodie.bucket.index.max.num.buckets':128
       ,'hoodie.bucket.index.min.num.buckets':32
       ,"hoodie.bucket.index.num.buckets":4
   
       ## do split if the bucket size reach 1.5 * max_file_size
       ,"hoodie.bucket.index.split.threshold":1.5
       ## do merge if the bucket size smaller than 0.2 * max_file_size
       ,"hoodie.bucket.index.merge.threshold": 0.1
       ,"hoodie.datasource.write.partitionpath.field":PARTITION_FIELD
   
   
       ,"hoodie.clustering.inline":"true"
       ,"hoodie.clustering.inline.max.commit":2
       ,"hoodie.clustering.inline.max.commits":2
       ,"hoodie.clustering.plan.strategy.target.file.max.bytes": "1073741824"
       ,"hoodie.clustering.plan.strategy.small.file.limit":"629145600"
       ,"hoodie.clustering.plan.strategy.class":"org.apache.hudi.client.clustering.plan.strategy.SparkConsistentBucketClusteringPlanStrategy"
       ,"hoodie.clustering.execution.strategy.class":"org.apache.hudi.client.clustering.run.strategy.SparkConsistentBucketClusteringExecutionStrategy"
       ,"hoodie.clustering.updates.strategy":"org.apache.hudi.client.clustering.update.strategy.SparkConsistentBucketDuplicateUpdateStrategy"
   
   
   
       ,"hoodie.clean.automatic": "true"
       , "hoodie.clean.async": "true"
       , "hoodie.cleaner.policy": 'KEEP_LATEST_FILE_VERSIONS'
       , "hoodie.cleaner.fileversions.retained": "3"
       , "hoodie-conf hoodie.cleaner.parallelism": '200'
       , 'hoodie.cleaner.commits.retained': 2
   
   }
   
   
   data = DataGenerator.get_data(1000)
   columns = ["emp_id", "employee_name", "department", "state", "salary", "age", "bonus", "ts", "email", "credit_card","year", "month"]
   spark_df = spark.createDataFrame(data=data, schema=columns)
   
   
   start = datetime.datetime.now()
   spark_df.write.format("hudi"). \
       options(**hudi_options). \
       mode("append"). \
       save(path)
   end = datetime.datetime.now()
   print(f"Execution Time {end-start}")
   """"
   
   ### Consistent Hashing Index is still an evolving feature and currently there are some limitations to use it as of 0.13.0:
   
   ###### This index is supported only for Spark engine using a MOR table.
   * It does not work with metadata table enabled.
   * To scale up or shrink the buckets, users have to manually trigger clustering using above configs (at some cadence), but they cannot have compaction concurrently running.
   
   So, if compaction is enabled with your regular write pipeline, please follow this recommendation: You can choose to trigger the scale/shrink once every 12 hours. In such cases, once every 12 hours, you might need to disable compaction, stop your write pipeline and enable clustering. You should take extreme care to not run both concurrently because it might result in conflicts and a failed pipeline. Once clustering is complete, you can resume your regular write pipeline, which will have compaction enabled.
   """
   
   ```
   
   #### Code works fine if i ran it once i ran the same code again it throws error 
   ```
   
   ---------------------------------------------------------------------------
   Py4JJavaError                             Traceback (most recent call last)
   <timed exec> in <module>
   
   ~\Anaconda3\lib\site-packages\pyspark\sql\readwriter.py in save(self, path, format, mode, partitionBy, **options)
       966             self._jwrite.save()
       967         else:
   --> 968             self._jwrite.save(path)
       969 
       970     @since(1.4)
   
   ~\Anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
      1319 
      1320         answer = self.gateway_client.send_command(command)
   -> 1321         return_value = get_return_value(
      1322             answer, self.gateway_client, self.target_id, self.name)
      1323 
   
   ~\Anaconda3\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
       188     def deco(*a: Any, **kw: Any) -> Any:
       189         try:
   --> 190             return f(*a, **kw)
       191         except Py4JJavaError as e:
       192             converted = convert_exception(e.java_exception)
   
   ~\Anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
       324             value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
       325             if answer[1] == REFERENCE_TYPE:
   --> 326                 raise Py4JJavaError(
       327                     "An error occurred while calling {0}{1}{2}.\n".
       328                     format(target_id, ".", name), value)
   
   Py4JJavaError: An error occurred while calling o129.save.
   : java.util.concurrent.CompletionException: org.apache.hudi.exception.HoodieClusteringException: Not implement yet
   	at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:315)
   	at java.base/java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:320)
   	at java.base/java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1770)
   	at java.base/java.util.concurrent.CompletableFuture$AsyncSupply.exec(CompletableFuture.java:1760)
   	at java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:387)
   	at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1311)
   	at java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1840)
   	at java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1806)
   	at java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:177)
   Caused by: org.apache.hudi.exception.HoodieClusteringException: Not implement yet
   	at org.apache.hudi.client.clustering.run.strategy.SparkConsistentBucketClusteringExecutionStrategy.performClusteringWithRecordsAsRow(SparkConsistentBucketClusteringExecutionStrategy.java:67)
   	at org.apache.hudi.client.clustering.run.strategy.MultipleSparkJobExecutionStrategy.lambda$runClusteringForGroupAsyncAsRow$6(MultipleSparkJobExecutionStrategy.java:249)
   	at java.base/java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1768)
   	... 6 more
   
   ```
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org