You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by "bvaradar (via GitHub)" <gi...@apache.org> on 2023/04/11 05:29:04 UTC

[GitHub] [hudi] bvaradar commented on a diff in pull request #8303: [HUDI-5998] Speed up reads from bootstrapped tables in spark

bvaradar commented on code in PR #8303:
URL: https://github.com/apache/hudi/pull/8303#discussion_r1162253186


##########
docker/demo/sparksql-batch2.commands:
##########
@@ -26,7 +26,8 @@ spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from s
 spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
 spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG'").show(100, false)
 
- // Copy-On-Write Bootstrapped table
+// Copy-On-Write Bootstrapped table
+spark.sql("set hoodie.bootstrap.data.queries.only=false")

Review Comment:
   Are there any integration tests for bootstrap that run with this feature turned on?



##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala:
##########
@@ -100,7 +101,7 @@ class DefaultSource extends RelationProvider
       )
     } else {
       Map()
-    }) ++ DataSourceOptionsHelper.parametersWithReadDefaults(optParams)
+    }) ++ DataSourceOptionsHelper.parametersWithReadDefaults(sqlContext.getAllConfs.filter(k => k._1.startsWith("hoodie.")) ++ optParams)

Review Comment:
   Why is this needed?



##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala:
##########
@@ -270,6 +271,21 @@ object DefaultSource {
     }
   }
 
+  private def resolveHoodieBootstrapRelation(sqlContext: SQLContext,
+                                             globPaths: Seq[Path],
+                                             userSchema: Option[StructType],
+                                             metaClient: HoodieTableMetaClient,
+                                             parameters: Map[String, String]): BaseRelation = {
+    val enableFileIndex = HoodieSparkConfUtils.getConfigValue(parameters, sqlContext.sparkSession.sessionState.conf,
+      ENABLE_HOODIE_FILE_INDEX.key, ENABLE_HOODIE_FILE_INDEX.defaultValue.toString).toBoolean
+    if (!enableFileIndex || globPaths.nonEmpty || parameters.getOrElse(HoodieBootstrapConfig.DATA_QUERIES_ONLY.key(), "true") != "true") {

Review Comment:
   @jonvex: Wouldn't this change cause user queries that include hoodie metadata columns to fail? Can't we just use the userSchema being passed here to check whether any hoodie metadata columns are being queried, and then decide the appropriate next steps?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org