You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by "Ethan Guo (Jira)" <ji...@apache.org> on 2023/02/16 06:08:00 UTC
[jira] [Updated] (HUDI-4783) Hive-style partition path ("partition=value") does not work with bootstrap
[ https://issues.apache.org/jira/browse/HUDI-4783?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Ethan Guo updated HUDI-4783:
----------------------------
Fix Version/s: 0.12.1
(was: 0.13.0)
> Hive-style partition path ("partition=value") does not work with bootstrap
> --------------------------------------------------------------------------
>
> Key: HUDI-4783
> URL: https://issues.apache.org/jira/browse/HUDI-4783
> Project: Apache Hudi
> Issue Type: Bug
> Affects Versions: 0.12.0
> Reporter: Ethan Guo
> Assignee: Ethan Guo
> Priority: Major
> Fix For: 0.12.1
>
>
> Create a partitioned parquet table, generating partition paths like "partition=2022%2F1%2F24" (the Hive-style, URL-encoded form of "2022/1/24"). The bootstrap operation does not generate the expected partitions; it only generates the "__HIVE_DEFAULT_PARTITION__" partition in the target Hudi table.
> {code:java}
> val srcPath = "<>/bootstrap-testing/partitioned-parquet-table"
> val basePath = "<>/bootstrap-testing/bootstrap-hudi-table-1"
> val bootstrapDF = spark.emptyDataFrame
> bootstrapDF.write
> .format("hudi")
> .option(HoodieWriteConfig.TABLE_NAME, "hoodie_test")
> .option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL)
> .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "key")
> .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "partition")
> .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts")
> .option(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP, srcPath)
> .option(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS, classOf[SimpleKeyGenerator].getName)
> .option(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR, classOf[BootstrapRegexModeSelector].getName)
> .option(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR_REGEX, "2022/1/2[4-8]")
> .option(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR_REGEX_MODE, "METADATA_ONLY")
> .option(HoodieBootstrapConfig.FULL_BOOTSTRAP_INPUT_PROVIDER, classOf[SparkParquetBootstrapDataProvider].getName)
> .mode(SaveMode.Overwrite)
> .save(basePath) {code}
> Querying the target table using the Spark datasource throws an IllegalArgumentException:
> {code:java}
> spark.read.format("hudi").load("<>/bootstrap-testing/bootstrap-hudi-table-1")
> java.lang.IllegalArgumentException: Cannot find columns: 'partition' in the schema[StructField(_hoodie_commit_time,StringType,true),StructField(_hoodie_commit_seqno,StringType,true),StructField(_hoodie_record_key,StringType,true),StructField(_hoodie_partition_path,StringType,true),StructField(_hoodie_file_name,StringType,true),StructField(key,StringType,true),StructField(ts,LongType,true),StructField(textField,StringType,true),StructField(decimalField,FloatType,true),StructField(longField,LongType,true),StructField(arrayField,ArrayType(IntegerType,true),true),StructField(mapField,MapType(StringType,IntegerType,true),true),StructField(round,IntegerType,true)]
> at org.apache.hudi.SparkHoodieTableFileIndex._partitionSchemaFromProperties$lzycompute(SparkHoodieTableFileIndex.scala:113)
> at org.apache.hudi.SparkHoodieTableFileIndex._partitionSchemaFromProperties(SparkHoodieTableFileIndex.scala:87)
> at org.apache.hudi.SparkHoodieTableFileIndex.partitionSchema(SparkHoodieTableFileIndex.scala:147)
> at org.apache.hudi.SparkHoodieTableFileIndex.parsePartitionColumnValues(SparkHoodieTableFileIndex.scala:258)
> at org.apache.hudi.BaseHoodieTableFileIndex.lambda$getAllQueryPartitionPaths$3(BaseHoodieTableFileIndex.java:190)
> at java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:193)
> at java.util.ArrayList$ArrayListSpliterator.forEachRemaining(ArrayList.java:1384)
> at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482)
> at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472)
> at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708)
> at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
> at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566)
> at org.apache.hudi.BaseHoodieTableFileIndex.getAllQueryPartitionPaths(BaseHoodieTableFileIndex.java:193)
> at org.apache.hudi.BaseHoodieTableFileIndex.loadPartitionPathFiles(BaseHoodieTableFileIndex.java:225)
> at org.apache.hudi.BaseHoodieTableFileIndex.doRefresh(BaseHoodieTableFileIndex.java:270)
> at org.apache.hudi.BaseHoodieTableFileIndex.<init>(BaseHoodieTableFileIndex.java:140)
> at org.apache.hudi.SparkHoodieTableFileIndex.<init>(SparkHoodieTableFileIndex.scala:69)
> at org.apache.hudi.HoodieFileIndex.<init>(HoodieFileIndex.scala:80)
> at org.apache.hudi.HoodieBootstrapRelation.buildFileIndex(HoodieBootstrapRelation.scala:165)
> at org.apache.hudi.HoodieBootstrapRelation.<init>(HoodieBootstrapRelation.scala:66)
> at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:144)
> at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:68)
> at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:350)
> at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:274)
> at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:245)
> at scala.Option.getOrElse(Option.scala:189)
> at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:245)
> at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:188)
> ... 47 elided {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)