You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/02/14 19:42:45 UTC

[GitHub] [hudi] yihua commented on a change in pull request #4743: [HUDI-3280] Cleaning up Hive-related hierarchies after refactoring

yihua commented on a change in pull request #4743:
URL: https://github.com/apache/hudi/pull/4743#discussion_r806151420



##########
File path: hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieCopyOnWriteTableInputFormat.java
##########
@@ -200,6 +184,16 @@ protected boolean includeLogFilesForSnapshotView() {
     return HoodieInputFormatUtils.filterIncrementalFileStatus(jobContext, tableMetaClient, timeline.get(), fileStatuses, commitsToCheck.get());
   }
 
+  protected FileStatus createFileStatusUnchecked(FileSlice fileSlice, HiveHoodieTableFileIndex fileIndex, Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt) {

Review comment:
       nit: remove unused `fileIndex`?

##########
File path: hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java
##########
@@ -147,18 +148,21 @@ public WriteOperationType getOperationType() {
    * been touched multiple times in the given commits, the return value will keep the one
    * from the latest commit.
    *
+   *
+   * @param hadoopConf
    * @param basePath The base path
    * @return the file full path to file status mapping
    */
-  public Map<String, FileStatus> getFullPathToFileStatus(String basePath) {
+  public Map<String, FileStatus> getFullPathToFileStatus(Configuration hadoopConf, String basePath) {
     Map<String, FileStatus> fullPathToFileStatus = new HashMap<>();
     for (List<HoodieWriteStat> stats : getPartitionToWriteStats().values()) {
       // Iterate through all the written files.
       for (HoodieWriteStat stat : stats) {
         String relativeFilePath = stat.getPath();
         Path fullPath = relativeFilePath != null ? FSUtils.getPartitionPath(basePath, relativeFilePath) : null;
         if (fullPath != null) {
-          FileStatus fileStatus = new FileStatus(stat.getFileSizeInBytes(), false, 0, 0,
+          long blockSize = FSUtils.getFs(fullPath.toString(), hadoopConf).getDefaultBlockSize(fullPath);

Review comment:
       Should the block size be extracted outside the loop based on the file system of base path, since it's a file-system config?

##########
File path: hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieTableInputFormat.java
##########
@@ -0,0 +1,54 @@
+package org.apache.hudi.hadoop;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.JobConf;
+
+import java.io.IOException;
+
+/**
+ * Abstract base class of the Hive's {@link FileInputFormat} implementations allowing for reading of Hudi's
+ * Copy-on-Write (COW) and Merge-on-Read (MOR) tables
+ */
+public abstract class HoodieTableInputFormat extends FileInputFormat<NullWritable, ArrayWritable>
+    implements Configurable {
+
+  protected Configuration conf;
+
+  @Override
+  public final Configuration getConf() {
+    return conf;
+  }
+
+  @Override
+  public final void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  @Override
+  protected boolean isSplitable(FileSystem fs, Path filename) {
+    return super.isSplitable(fs, filename);
+  }
+
+  @Override
+  protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) {
+    return super.makeSplit(file, start, length, hosts);
+  }
+
+  @Override
+  protected FileSplit makeSplit(Path file, long start, long length, String[] hosts, String[] inMemoryHosts) {
+    return super.makeSplit(file, start, length, hosts, inMemoryHosts);
+  }
+
+  @Override
+  protected FileStatus[] listStatus(JobConf job) throws IOException {
+    return super.listStatus(job);
+  }

Review comment:
      these seem unnecessary?

##########
File path: hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RealtimeFileStatus.java
##########
@@ -34,51 +35,62 @@
  * in Path.
  */
 public class RealtimeFileStatus extends FileStatus {

Review comment:
      Do you want to rename this and the other classes containing the `Realtime` word as well?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org