You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by vi...@apache.org on 2021/01/18 15:30:04 UTC
[hudi] branch master updated: [HUDI-1529] Add block size to the FileStatus objects returned from metadata table to avoid too many file splits (#2451)
This is an automated email from the ASF dual-hosted git repository.
vinoth pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 684e12e [HUDI-1529] Add block size to the FileStatus objects returned from metadata table to avoid too many file splits (#2451)
684e12e is described below
commit 684e12e9fcfa45c6ac922a84fb3116ac8142bc18
Author: Udit Mehrotra <ud...@gmail.com>
AuthorDate: Mon Jan 18 07:29:53 2021 -0800
[HUDI-1529] Add block size to the FileStatus objects returned from metadata table to avoid too many file splits (#2451)
---
.../org/apache/hudi/metadata/TestHoodieBackedMetadata.java | 8 ++++++++
.../main/java/org/apache/hudi/metadata/BaseTableMetadata.java | 2 +-
.../java/org/apache/hudi/metadata/HoodieMetadataPayload.java | 10 +++++++---
3 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java
index 3d770c7..16ee120 100644
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java
@@ -801,6 +801,14 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
// File sizes should be valid
Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getLen() > 0));
+ // Block sizes should be valid
+ Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getBlockSize() > 0));
+ List<Long> fsBlockSizes = Arrays.stream(fsStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList());
+ Collections.sort(fsBlockSizes);
+ List<Long> metadataBlockSizes = Arrays.stream(metaStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList());
+ Collections.sort(metadataBlockSizes);
+ assertEquals(fsBlockSizes, metadataBlockSizes);
+
if ((fsFileNames.size() != metadataFilenames.size()) || (!fsFileNames.equals(metadataFilenames))) {
LOG.info("*** File system listing = " + Arrays.toString(fsFileNames.toArray()));
LOG.info("*** Metadata listing = " + Arrays.toString(metadataFilenames.toArray()));
diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java
index 4ae71de..de0a3c4 100644
--- a/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java
+++ b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java
@@ -202,7 +202,7 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
throw new HoodieMetadataException("Metadata record for partition " + partitionName + " is inconsistent: "
+ hoodieRecord.get().getData());
}
- statuses = hoodieRecord.get().getData().getFileStatuses(partitionPath);
+ statuses = hoodieRecord.get().getData().getFileStatuses(hadoopConf.get(), partitionPath);
}
if (validateLookups) {
diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java
index 0863f7e..9c6eb89 100644
--- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java
+++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java
@@ -29,7 +29,9 @@ import org.apache.hudi.exception.HoodieMetadataException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
@@ -177,10 +179,12 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata
/**
* Returns the files added as part of this record.
*/
- public FileStatus[] getFileStatuses(Path partitionPath) {
+ public FileStatus[] getFileStatuses(Configuration hadoopConf, Path partitionPath) throws IOException {
+ FileSystem fs = partitionPath.getFileSystem(hadoopConf);
+ long blockSize = fs.getDefaultBlockSize(partitionPath);
return filterFileInfoEntries(false)
- .map(e -> new FileStatus(e.getValue().getSize(), false, 0, 0, 0, 0, null, null, null,
- new Path(partitionPath, e.getKey())))
+ .map(e -> new FileStatus(e.getValue().getSize(), false, 0, blockSize, 0, 0,
+ null, null, null, new Path(partitionPath, e.getKey())))
.toArray(FileStatus[]::new);
}