You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by "ASF GitHub Bot (Jira)" <ji...@apache.org> on 2022/02/10 13:44:00 UTC
[jira] [Updated] (HUDI-3408) fixed the bug that BUCKET_INDEX cannot process special characters
[ https://issues.apache.org/jira/browse/HUDI-3408?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
ASF GitHub Bot updated HUDI-3408:
---------------------------------
Labels: pull-request-available (was: )
> fixed the bug that BUCKET_INDEX cannot process special characters
> -----------------------------------------------------------------
>
> Key: HUDI-3408
> URL: https://issues.apache.org/jira/browse/HUDI-3408
> Project: Apache Hudi
> Issue Type: Bug
> Components: core
> Affects Versions: 0.10.1
> Environment: spark3.1.1
> Reporter: Tao Meng
> Priority: Major
> Labels: pull-request-available
>
> BucketIdentifier uses split(":") to split recordKeyName and recordKeyValue; if the current recordKeyValue is ":::", the split above gives a wrong result.
> test("test Bucket") {
> withTempDir { tmp =>
> Seq("mor").foreach { tableType =>
> val tableName = generateTableName
> val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}"
> spark.sql(
> s"""
> |create table $tableName (
> | id int, comb int, col0 int, col1 bigint, col2 float, col3 double, col4 decimal(10,4), col5 string, col6 date, col7 timestamp, col8 boolean, col9 binary, par date
> |) using hudi
> | location '$tablePath'
> | options (
> | type = '$tableType',
> | primaryKey = 'id,col0,col5',
> | preCombineField = 'comb',
> | hoodie.index.type = 'BUCKET',
> | hoodie.bucket.index.num.buckets = '900'
> | )
> | partitioned by (par)
> """.stripMargin)
> spark.sql(
> s"""
> | insert into $tableName values
> | (1,1,11,100001,101.01,1001.0001,100001.0001,':::','2021-12-25','2021-12-25 12:01:01',true,'a01','2021-12-25'),
> | (2,2,12,100002,102.02,1002.0002,100002.0002,'a000002','2021-12-25','2021-12-25 12:02:02',true,'a02','2021-12-25'),
> | (3,3,13,100003,103.03,1003.0003,100003.0003,'a000003','2021-12-25','2021-12-25 12:03:03',false,'a03','2021-12-25'),
> | (4,4,14,100004,104.04,1004.0004,100004.0004,'a000004','2021-12-26','2021-12-26 12:04:04',true,'a04','2021-12-26'),
> | (5,5,15,100005,105.05,1005.0005,100005.0005,'a000005','2021-12-26','2021-12-26 12:05:05',false,'a05','2021-12-26')
> |""".stripMargin)
> }
> }
> }
>
> Caused by: java.lang.ArrayIndexOutOfBoundsException: 1
> at org.apache.hudi.index.bucket.BucketIdentifier.lambda$getBucketId$2(BucketIdentifier.java:46)
> at java.util.stream.Collectors.lambda$toMap$58(Collectors.java:1321)
> at java.util.stream.ReduceOps$3ReducingSink.accept(ReduceOps.java:169)
> at java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:193)
> at java.util.Spliterators$ArraySpliterator.forEachRemaining(Spliterators.java:948)
> at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:481)
> at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:471)
> at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708)
> at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
> at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:499)
> at org.apache.hudi.index.bucket.BucketIdentifier.getBucketId(BucketIdentifier.java:46)
> at org.apache.hudi.index.bucket.BucketIdentifier.getBucketId(BucketIdentifier.java:36)
> at org.apache.hudi.index.bucket.SparkBucketIndex$1.computeNext(SparkBucketIndex.java:80)
> at org.apache.hudi.index.bucket.SparkBucketIndex$1.computeNext(SparkBucketIndex.java:70)
> at org.apache.hudi.client.utils.LazyIterableIterator.next(LazyIterableIterator.java:125)
> ... 25 more
>
--
This message was sent by Atlassian Jira
(v8.20.1#820001)