You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by vi...@apache.org on 2021/01/18 15:30:04 UTC

[hudi] branch master updated: [HUDI-1529] Add block size to the FileStatus objects returned from metadata table to avoid too many file splits (#2451)

This is an automated email from the ASF dual-hosted git repository.

vinoth pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new 684e12e  [HUDI-1529] Add block size to the FileStatus objects returned from metadata table to avoid too many file splits (#2451)
684e12e is described below

commit 684e12e9fcfa45c6ac922a84fb3116ac8142bc18
Author: Udit Mehrotra <ud...@gmail.com>
AuthorDate: Mon Jan 18 07:29:53 2021 -0800

    [HUDI-1529] Add block size to the FileStatus objects returned from metadata table to avoid too many file splits (#2451)
---
 .../org/apache/hudi/metadata/TestHoodieBackedMetadata.java     |  8 ++++++++
 .../main/java/org/apache/hudi/metadata/BaseTableMetadata.java  |  2 +-
 .../java/org/apache/hudi/metadata/HoodieMetadataPayload.java   | 10 +++++++---
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java
index 3d770c7..16ee120 100644
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java
@@ -801,6 +801,14 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
         // File sizes should be valid
         Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getLen() > 0));
 
+        // Block sizes should be valid
+        Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getBlockSize() > 0));
+        List<Long> fsBlockSizes = Arrays.stream(fsStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList());
+        Collections.sort(fsBlockSizes);
+        List<Long> metadataBlockSizes = Arrays.stream(metaStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList());
+        Collections.sort(metadataBlockSizes);
+        assertEquals(fsBlockSizes, metadataBlockSizes);
+
         if ((fsFileNames.size() != metadataFilenames.size()) || (!fsFileNames.equals(metadataFilenames))) {
           LOG.info("*** File system listing = " + Arrays.toString(fsFileNames.toArray()));
           LOG.info("*** Metadata listing = " + Arrays.toString(metadataFilenames.toArray()));
diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java
index 4ae71de..de0a3c4 100644
--- a/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java
+++ b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java
@@ -202,7 +202,7 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
         throw new HoodieMetadataException("Metadata record for partition " + partitionName + " is inconsistent: "
             + hoodieRecord.get().getData());
       }
-      statuses = hoodieRecord.get().getData().getFileStatuses(partitionPath);
+      statuses = hoodieRecord.get().getData().getFileStatuses(hadoopConf.get(), partitionPath);
     }
 
     if (validateLookups) {
diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java
index 0863f7e..9c6eb89 100644
--- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java
+++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java
@@ -29,7 +29,9 @@ import org.apache.hudi.exception.HoodieMetadataException;
 import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.generic.IndexedRecord;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 
 import java.io.IOException;
@@ -177,10 +179,12 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata
   /**
    * Returns the files added as part of this record.
    */
-  public FileStatus[] getFileStatuses(Path partitionPath) {
+  public FileStatus[] getFileStatuses(Configuration hadoopConf, Path partitionPath) throws IOException {
+    FileSystem fs = partitionPath.getFileSystem(hadoopConf);
+    long blockSize = fs.getDefaultBlockSize(partitionPath);
     return filterFileInfoEntries(false)
-        .map(e -> new FileStatus(e.getValue().getSize(), false, 0, 0, 0, 0, null, null, null,
-            new Path(partitionPath, e.getKey())))
+        .map(e -> new FileStatus(e.getValue().getSize(), false, 0, blockSize, 0, 0,
+            null, null, null, new Path(partitionPath, e.getKey())))
         .toArray(FileStatus[]::new);
   }