You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2021/05/25 15:59:37 UTC

[GitHub] [hudi] vinothchandar commented on a change in pull request #2926: [HUDI-1879] Support Partition Prune For MergeOnRead Snapshot Table

vinothchandar commented on a change in pull request #2926:
URL: https://github.com/apache/hudi/pull/2926#discussion_r638929143



##########
File path: hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala
##########
@@ -131,15 +133,28 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext,
     rdd.asInstanceOf[RDD[Row]]
   }
 
-  def buildFileIndex(): List[HoodieMergeOnReadFileSplit] = {
+  def buildFileIndex(filters: Array[Filter]): List[HoodieMergeOnReadFileSplit] = {
+
     val fileStatuses = if (globPaths.isDefined) {
       // Load files from the global paths if it has defined to be compatible with the original mode
       val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(sqlContext.sparkSession, globPaths.get)
       inMemoryFileIndex.allFiles()
     } else { // Load files by the HoodieFileIndex.
       val hoodieFileIndex = HoodieFileIndex(sqlContext.sparkSession, metaClient,
         Some(tableStructSchema), optParams, FileStatusCache.getOrCreate(sqlContext.sparkSession))
-      hoodieFileIndex.allFiles
+
+      // Get partition filter and convert to catalyst expression
+      val partitionColumns = hoodieFileIndex.partitionSchema.fieldNames.toSet
+      val partitionFilters= filters.filter(f => f.references.forall(p => partitionColumns.contains(p)))

Review comment:
       nit: add a space before `=`.

##########
File path: hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala
##########
@@ -182,4 +197,98 @@ object MergeOnReadSnapshotRelation {
     // when create PartitionedFile.
     path.toUri.toString
   }
+
+  /**
+   * Convert Filters to Catalyst Expressions and joined by And. If convert success return an
+   * Non-Empty Option[Expression],or else return None.
+   */
+  def convertToCatalystExpressions(filters: Array[Filter],

Review comment:
       Can we encapsulate this conversion logic into its own class? I could see general use for this beyond just partition pruning.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org