Posted to commits@hudi.apache.org by da...@apache.org on 2021/09/23 07:14:45 UTC

[hudi] branch master updated: [HUDI-2479] HoodieFileIndex throws NPE for FileSlice with pure log files (#3702)

This is an automated email from the ASF dual-hosted git repository.

danny0405 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new 5515a0d  [HUDI-2479] HoodieFileIndex throws NPE for FileSlice with pure log files (#3702)
5515a0d is described below

commit 5515a0d319cbac835c65f6d21898ac1399d77ea3
Author: Danny Chan <yu...@gmail.com>
AuthorDate: Thu Sep 23 15:14:30 2021 +0800

    [HUDI-2479] HoodieFileIndex throws NPE for FileSlice with pure log files (#3702)
---
 .../src/main/scala/org/apache/hudi/HoodieFileIndex.scala    | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
index b87be99..f771bc3 100644
--- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
+++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
@@ -56,7 +56,7 @@ import scala.collection.mutable
  *
  * 2、If the partition columns size is not equal to the partition path level, but the partition
  * column size is "1" (e.g. partition column is "dt", but the partition path is "2021/03/10"
- * who'es directory level is 3).We can still read it as a partitioned table. We will mapping the
+ * who's directory level is 3).We can still read it as a partitioned table. We will mapping the
  * partition path (e.g. 2021/03/10) to the only partition column (e.g. "dt").
  *
  * 3、Else the the partition columns size is not equal to the partition directory level and the
@@ -256,7 +256,7 @@ case class HoodieFileIndex(
             .iterator().asScala.toSeq
           (p._1, fileSlices)
         })
-        cachedFileSize = cachedAllInputFileSlices.values.flatten.map(_.getBaseFile.get().getFileLen).sum
+        cachedFileSize = cachedAllInputFileSlices.values.flatten.map(fileSliceSize).sum
     }
 
     // If the partition value contains InternalRow.empty, we query it as a non-partitioned table.
@@ -266,6 +266,15 @@ case class HoodieFileIndex(
       s" spend: $flushSpend ms")
   }
 
+  private def fileSliceSize(fileSlice: FileSlice): Long = {
+    val logFileSize = fileSlice.getLogFiles.iterator().asScala.map(_.getFileSize).filter(_ > 0).sum
+    if (fileSlice.getBaseFile.isPresent) {
+      fileSlice.getBaseFile.get().getFileLen + logFileSize
+    } else {
+      logFileSize
+    }
+  }
+
   override def sizeInBytes: Long = {
     cachedFileSize
   }
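
For context: before this patch, cachedFileSize was computed by calling getBaseFile.get().getFileLen on every file slice, which fails when a slice contains only log files and no base file (for example, a Merge-On-Read file group that has not been compacted yet). The sketch below is a minimal, standalone illustration of the guarded size computation that the new fileSliceSize helper performs; BaseFile, LogFile, Slice and sliceSize are hypothetical simplified stand-ins, not the real Hudi classes, so treat it as an illustration rather than the actual implementation.

    import java.util.Optional

    // Hypothetical simplified stand-ins for Hudi's HoodieBaseFile, HoodieLogFile and FileSlice.
    final case class BaseFile(fileLen: Long)
    final case class LogFile(fileSize: Long)
    final case class Slice(baseFile: Optional[BaseFile], logFiles: Seq[LogFile])

    object FileSliceSizeSketch extends App {

      // Mirrors the idea of the patched fileSliceSize: sum the positive log-file sizes and
      // add the base-file length only when a base file is actually present.
      def sliceSize(slice: Slice): Long = {
        val logSize = slice.logFiles.map(_.fileSize).filter(_ > 0).sum
        if (slice.baseFile.isPresent) slice.baseFile.get().fileLen + logSize else logSize
      }

      // A pure-log file slice (no base file) no longer breaks the size estimate.
      val pureLogSlice = Slice(Optional.empty[BaseFile](), Seq(LogFile(1024L), LogFile(2048L)))
      assert(sliceSize(pureLogSlice) == 3072L)

      // A slice with a base file still counts both base-file and log-file bytes.
      val compactedSlice = Slice(Optional.of(BaseFile(4096L)), Seq.empty)
      assert(sliceSize(compactedSlice) == 4096L)
    }

The key design point of the fix is simply to check isPresent before dereferencing the optional base file, so that sizeInBytes can still be estimated from log files alone.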