Posted to commits@hudi.apache.org by "ASF GitHub Bot (Jira)" <ji...@apache.org> on 2023/03/29 13:55:00 UTC

[jira] [Updated] (HUDI-5999) When reading from a non-partitioned bootstrap table, an exception occurs because extractPartitionValuesFromPartitionPath is always set to true

     [ https://issues.apache.org/jira/browse/HUDI-5999?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

ASF GitHub Bot updated HUDI-5999:
---------------------------------
    Labels: pull-request-available  (was: )

> When reading from a non-partitioned bootstrap table, an exception occurs because extractPartitionValuesFromPartitionPath is always set to true
> --------------------------------------------------------------------------------------------------------------------------------------------
>
>                 Key: HUDI-5999
>                 URL: https://issues.apache.org/jira/browse/HUDI-5999
>             Project: Apache Hudi
>          Issue Type: Bug
>          Components: bootstrap, spark
>            Reporter: Jonathan Vexler
>            Priority: Major
>              Labels: pull-request-available
>
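> The root cause, reconstructed from the stack trace below: HoodieBootstrapRelation.collectFileSplits builds a partition-values InternalRow for every base file via HoodieBaseRelation.getPartitionColumnsAsInternalRowInternal, passing extractPartitionValuesFromPartitionPath=true unconditionally. Spark's PartitioningUtils.parsePathFragment expects each '/'-separated segment of the partition path to have the shape "col=value"; a non-partitioned table has no such segment, so the key/value split yields a single element and the value lookup indexes past it. A minimal sketch of that failing split (paraphrased from PartitioningUtils.parsePathFragmentAsSeq, not the verbatim Spark source):
> {code:scala}
> // Spark splits each partition-path segment on '=' and reads both halves.
> // A segment from a non-partitioned table contains no '=', so the split
> // returns a one-element array and pair(1) throws the
> // ArrayIndexOutOfBoundsException seen in the trace below.
> val segment = "no-partition-fragment-here"
> val pair = segment.split("=", 2)       // Array("no-partition-fragment-here")
> val (col, value) = (pair(0), pair(1))  // java.lang.ArrayIndexOutOfBoundsException
> {code}
>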
> Exception:
> {code:java}
> 23/03/29 09:36:08 WARN HoodieBootstrapRelation: Failed to get the right partition InternalRow for file: DeprecatedRawLocalFileStatus{path=file:/Users/jon/Documents/bootstrap_testing/perf/metadata_perf_tables/date_dim/9a34fde4-8d8c-4de9-ae0c-559b5e44b5b3_0-523-22998_00000000000001.parquet; isDirectory=false; length=575504; replication=1; blocksize=33554432; modification_time=1680048819339; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}
> java.lang.ArrayIndexOutOfBoundsException: 1
> 	at org.apache.spark.sql.execution.datasources.PartitioningUtils$.$anonfun$parsePathFragmentAsSeq$1(PartitioningUtils.scala:346)
> 	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
> 	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
> 	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
> 	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
> 	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
> 	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
> 	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
> 	at org.apache.spark.sql.execution.datasources.PartitioningUtils$.parsePathFragmentAsSeq(PartitioningUtils.scala:344)
> 	at org.apache.spark.sql.execution.datasources.PartitioningUtils$.parsePathFragment(PartitioningUtils.scala:335)
> 	at org.apache.hudi.HoodieBaseRelation.getPartitionColumnsAsInternalRowInternal(HoodieBaseRelation.scala:489)
> 	at org.apache.hudi.HoodieBootstrapRelation.$anonfun$collectFileSplits$1(HoodieBootstrapRelation.scala:76)
> 	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
> 	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
> 	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
> 	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
> 	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
> 	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
> 	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
> 	at org.apache.hudi.HoodieBootstrapRelation.collectFileSplits(HoodieBootstrapRelation.scala:71)
> 	at org.apache.hudi.HoodieBaseRelation.buildScan(HoodieBaseRelation.scala:366)
> 	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.$anonfun$apply$4(DataSourceStrategy.scala:323)
> 	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.$anonfun$pruneFilterProject$1(DataSourceStrategy.scala:357)
> 	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProjectRaw(DataSourceStrategy.scala:436)
> 	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProject(DataSourceStrategy.scala:356)
> 	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.apply(DataSourceStrategy.scala:323)
> 	at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$1(QueryPlanner.scala:63)
> 	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
> 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
> 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
> 	at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
> 	at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:67)
> 	at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78)
> 	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:196)
> 	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:194)
> 	at scala.collection.Iterator.foreach(Iterator.scala:943)
> 	at scala.collection.Iterator.foreach$(Iterator.scala:943)
> 	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
> 	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
> 	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
> 	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
> 	at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:75)
> 	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
> 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
> 	at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
> 	at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:67)
> 	... (the 14 frames above, from QueryPlanner.$anonfun$plan$3 through SparkStrategies.plan, repeat verbatim 5 more times) ...
> 	at org.apache.spark.sql.execution.QueryExecution$.createSparkPlan(QueryExecution.scala:453)
> 	at org.apache.spark.sql.execution.QueryExecution.$anonfun$sparkPlan$1(QueryExecution.scala:144)
> 	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
> 	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:183)
> 	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
> 	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:183)
> 	at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:144)
> 	at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:137)
> 	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$1(QueryExecution.scala:157)
> 	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
> 	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:183)
> 	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
> 	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:183)
> 	at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:157)
> 	at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:150)
> 	at org.apache.spark.sql.execution.columnar.InMemoryRelation$.apply(InMemoryRelation.scala:325)
> 	at org.apache.spark.sql.execution.CacheManager.$anonfun$cacheQuery$2(CacheManager.scala:124)
> 	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
> 	at org.apache.spark.sql.execution.CacheManager.cacheQuery(CacheManager.scala:119)
> 	at org.apache.spark.sql.execution.CacheManager.cacheQuery(CacheManager.scala:92)
> 	at org.apache.spark.sql.Dataset.persist(Dataset.scala:3171)
> 	at org.apache.spark.sql.Dataset.cache(Dataset.scala:3181) {code}
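>
> A minimal reproduction sketch, assuming a metadata-only bootstrap of an existing non-partitioned parquet dataset (the paths, table name, and record key field below are hypothetical, and the exact set of required bootstrap options may differ across Hudi versions):
> {code:scala}
> import org.apache.spark.sql.SparkSession
>
> val spark = SparkSession.builder()
>   .master("local[*]")
>   .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
>   .getOrCreate()
>
> val srcPath  = "file:///tmp/src_parquet"     // hypothetical: existing non-partitioned parquet data
> val basePath = "file:///tmp/hudi_bootstrap"  // hypothetical: target Hudi table location
>
> // Bootstrap the parquet data into a Hudi table with no partition columns.
> spark.emptyDataFrame.write.format("hudi")
>   .option("hoodie.table.name", "date_dim")
>   .option("hoodie.datasource.write.operation", "bootstrap")
>   .option("hoodie.bootstrap.base.path", srcPath)
>   .option("hoodie.datasource.write.recordkey.field", "d_date_sk")
>   .option("hoodie.datasource.write.keygenerator.class",
>     "org.apache.hudi.keygen.NonpartitionedKeyGenerator")
>   .mode("overwrite")
>   .save(basePath)
>
> // The read is what fails: HoodieBootstrapRelation tries to extract partition
> // values from a partition path that does not exist for this table.
> spark.read.format("hudi").load(basePath).count()
> {code}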



--
This message was sent by Atlassian Jira
(v8.20.10#820010)