Posted to commits@hudi.apache.org by "Ethan Guo (Jira)" <ji...@apache.org> on 2022/09/06 05:22:00 UTC

[jira] [Created] (HUDI-4785) Cannot find partition column when querying bootstrapped table in Spark

Ethan Guo created HUDI-4785:
-------------------------------

             Summary: Cannot find partition column when querying bootstrapped table in Spark
                 Key: HUDI-4785
                 URL: https://issues.apache.org/jira/browse/HUDI-4785
             Project: Apache Hudi
          Issue Type: Bug
            Reporter: Ethan Guo


Bootstrap the table from an existing partitioned parquet source:

{code:java}
import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider
import org.apache.hudi.client.bootstrap.selector.BootstrapRegexModeSelector
import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieWriteConfig}
import org.apache.hudi.keygen.SimpleKeyGenerator
import org.apache.spark.sql.SaveMode

val srcPath = "<>/bootstrap-testing/partitioned-parquet-table-date"
val basePath = "<>/bootstrap-testing/bootstrap-hudi-table-2"
// Bootstrap reads its data from srcPath, so the input DataFrame stays empty.
val bootstrapDF = spark.emptyDataFrame
bootstrapDF.write
  .format("hudi")
  .option(HoodieWriteConfig.TABLE_NAME, "hoodie_test")
  .option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL)
  .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "key")
  .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "partition")
  .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts")
  .option(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP, srcPath)
  .option(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS, classOf[SimpleKeyGenerator].getName)
  // Partitions matching the regex are bootstrapped METADATA_ONLY; the rest get FULL_RECORD.
  .option(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR, classOf[BootstrapRegexModeSelector].getName)
  .option(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR_REGEX, "2022/1/2[4-8]")
  .option(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR_REGEX_MODE, "METADATA_ONLY")
  .option(HoodieBootstrapConfig.FULL_BOOTSTRAP_INPUT_PROVIDER, classOf[SparkParquetBootstrapDataProvider].getName)
  .mode(SaveMode.Overwrite)
  .save(basePath)
{code}
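The report jumps straight from the write to a SQL query against test_table, so the bootstrapped table was presumably read back and registered as a temp view along these lines (a minimal sketch, not part of the original report; the view name is chosen to match the query below):

{code:java}
// Hypothetical read-back step: load the bootstrapped table through the Hudi
// datasource and expose it to Spark SQL under the name used in the query.
val hudiDF = spark.read.format("hudi").load(basePath)
hudiDF.createOrReplaceTempView("test_table")
{code}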
Querying the bootstrapped table then fails to resolve the partition column:
{code:java}
scala> spark.sql("select partition, _hoodie_partition_path, count(*) from test_table group by partition, _hoodie_partition_path ")
org.apache.spark.sql.AnalysisException: cannot resolve 'partition' given input columns: [test_table._hoodie_commit_seqno, test_table._hoodie_commit_time, test_table._hoodie_file_name, test_table._hoodie_partition_path, test_table._hoodie_record_key, test_table.arrayField, test_table.decimalField, test_table.key, test_table.longField, test_table.mapField, test_table.round, test_table.textField, test_table.ts]; line 1 pos 76;
'Aggregate ['partition, _hoodie_partition_path#912], ['partition, _hoodie_partition_path#912, count(1) AS count(1)#956L]
+- SubqueryAlias test_table
   +- View (`test_table`, [_hoodie_commit_time#909,_hoodie_commit_seqno#910,_hoodie_record_key#911,_hoodie_partition_path#912,_hoodie_file_name#913,key#914,ts#915L,textField#916,decimalField#917,longField#918L,arrayField#919,mapField#920,round#921])
      +- Relation [_hoodie_commit_time#909,_hoodie_commit_seqno#910,_hoodie_record_key#911,_hoodie_partition_path#912,_hoodie_file_name#913,key#914,ts#915L,textField#916,decimalField#917,longField#918L,arrayField#919,mapField#920,round#921] org.apache.hudi.HoodieBootstrapRelation@2daee5f


  at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:54)
  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$$nestedInanonfun$checkAnalysis$1$2.applyOrElse(CheckAnalysis.scala:179)
  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$$nestedInanonfun$checkAnalysis$1$2.applyOrElse(CheckAnalysis.scala:175)
  at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformUpWithPruning$2(TreeNode.scala:535)
  at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
  at org.apache.spark.sql.catalyst.trees.TreeNode.transformUpWithPruning(TreeNode.scala:535)
  at org.apache.spark.sql.catalyst.plans.QueryPlan.$anonfun$transformExpressionsUpWithPruning$1(QueryPlan.scala:181)
  at org.apache.spark.sql.catalyst.plans.QueryPlan.$anonfun$mapExpressions$1(QueryPlan.scala:193)
  at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
  at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpression$1(QueryPlan.scala:193)
  at org.apache.spark.sql.catalyst.plans.QueryPlan.recursiveTransform$1(QueryPlan.scala:204)
  at org.apache.spark.sql.catalyst.plans.QueryPlan.$anonfun$mapExpressions$3(QueryPlan.scala:209)
  at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
  at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
  at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
  at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
  at scala.collection.TraversableLike.map(TraversableLike.scala:286)
  at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
  at scala.collection.AbstractTraversable.map(Traversable.scala:108)
  at org.apache.spark.sql.catalyst.plans.QueryPlan.recursiveTransform$1(QueryPlan.scala:209)
  at org.apache.spark.sql.catalyst.plans.QueryPlan.$anonfun$mapExpressions$4(QueryPlan.scala:214)
  at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:323)
  at org.apache.spark.sql.catalyst.plans.QueryPlan.mapExpressions(QueryPlan.scala:214)
  at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsUpWithPruning(QueryPlan.scala:181)
  at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsUp(QueryPlan.scala:161)
  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis$1(CheckAnalysis.scala:175)
  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis$1$adapted(CheckAnalysis.scala:94)
  at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:263)
  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis(CheckAnalysis.scala:94)
  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis$(CheckAnalysis.scala:91)
  at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:182)
  at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:205)
  at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:330)
  at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:202)
  at org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:88)
  at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
  at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:196)
  at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
  at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:196)
  at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:88)
  at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:86)
  at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:78)
  at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:98)
  at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
  at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:96)
  at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:618)
  at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
  at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:613)
  ... 47 elided
{code}
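The Hudi metadata columns still resolve, so a workaround sketch (not from the report) is to confirm the missing column in the schema read back from basePath and aggregate on _hoodie_partition_path alone, which the analyzer output above shows is present:

{code:java}
// Confirm that the data partition column is absent from the resolved schema.
spark.read.format("hudi").load(basePath).printSchema()

// Workaround sketch: group by the Hudi metadata partition path column, which
// is part of the resolved schema even though "partition" is not.
spark.sql(
  """select _hoodie_partition_path, count(*)
    |from test_table
    |group by _hoodie_partition_path""".stripMargin).show(false)
{code}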
 



--
This message was sent by Atlassian Jira
(v8.20.10#820010)