You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2019/05/24 03:00:14 UTC

[GitHub] [incubator-hudi] vinothchandar commented on a change in pull request #689: [HUDI-25] Optimize HoodieInputFormat.listStatus for faster Hive Incremental queries

vinothchandar commented on a change in pull request #689: [HUDI-25] Optimize HoodieInputFormat.listStatus for faster Hive Incremental queries
URL: https://github.com/apache/incubator-hudi/pull/689#discussion_r287200410
 
 

 ##########
 File path: hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java
 ##########
 @@ -62,65 +66,53 @@
 
   @Override
   public FileStatus[] listStatus(JobConf job) throws IOException {
-    // Get all the file status from FileInputFormat and then do the filter
-    FileStatus[] fileStatuses = super.listStatus(job);
-    Map<HoodieTableMetaClient, List<FileStatus>> groupedFileStatus = groupFileStatus(fileStatuses);
-    LOG.info("Found a total of " + groupedFileStatus.size() + " groups");
+    // Segregate inputPaths[] to incremental, non incremental and non hoodie paths
+    List<String> incrementalTables = HoodieHiveUtil.getIncrementalTableNames(Job.getInstance(job));
+    InputPathHandler inputPathHandler = new InputPathHandler(getInputPaths(job), incrementalTables);
     List<FileStatus> returns = new ArrayList<>();
-    for (Map.Entry<HoodieTableMetaClient, List<FileStatus>> entry : groupedFileStatus.entrySet()) {
-      HoodieTableMetaClient metadata = entry.getKey();
-      if (metadata == null) {
-        // Add all the paths which are not hoodie specific
-        returns.addAll(entry.getValue());
+
+    Map<String, HoodieTableMetaClient> tableMetaClientMap = inputPathHandler.tableMetaClientMap;
+    // process incremental pulls first
+    for (String table : incrementalTables) {
+      HoodieTableMetaClient metaClient = tableMetaClientMap.get(table);
+      if (metaClient == null) {
         continue;
       }
+      List<Path> inputPaths = inputPathHandler.groupedIncrementalPaths.get(metaClient);
+      List<FileStatus> result = listStatusForIncrementalMode(job, metaClient, inputPaths);
+      if (result != null) {
+        returns.addAll(result);
+      }
+    }
 
-      FileStatus[] statuses = entry.getValue().toArray(new FileStatus[entry.getValue().size()]);
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("Hoodie Metadata initialized with completed commit Ts as :" + metadata);
+    // process non hoodie Paths next.
+    List<Path> nonHoodiePaths = inputPathHandler.nonHoodieInputPaths;
+    if (nonHoodiePaths.size() > 0) {
+      setInputPaths(job, nonHoodiePaths.toArray(new Path[nonHoodiePaths.size()]));
+      FileStatus[] fileStatuses = super.listStatus(job);
 
 Review comment:
   In the same vein, shouldn't we reuse the fileStatuses value we obtain here for the hoodie paths as well?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services