Posted to commits@hive.apache.org by ha...@apache.org on 2019/03/09 17:15:17 UTC

[hive] branch master updated: HIVE-21390 : BI split strategy does not work for blob stores (Prasanth Jayachandran reviewed by Gopal V)

This is an automated email from the ASF dual-hosted git repository.

hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 13ec7c3  HIVE-21390 : BI split strategy does not work for blob stores (Prasanth Jayachandran reviewed by Gopal V)
13ec7c3 is described below

commit 13ec7c3ab47001df25e7be87739731192075b0a7
Author: Prasanth Jayachandran <pr...@apache.org>
AuthorDate: Sat Mar 9 09:13:59 2019 -0800

    HIVE-21390 : BI split strategy does not work for blob stores (Prasanth Jayachandran reviewed by Gopal V)
    
    Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
---
 .../java/org/apache/hadoop/hive/conf/HiveConf.java |  3 +++
 .../hive/hcatalog/streaming/TestStreaming.java     |  4 ++++
 .../streaming/mutate/StreamingTestUtils.java       |  5 ++++
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java      | 27 ++++++++++++++++------
 .../hive/ql/io/orc/TestInputOutputFormat.java      |  5 ++++
 5 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 076035b..c33d03e 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1984,6 +1984,9 @@ public class HiveConf extends Configuration {
         " ETL strategy is used when spending little more time in split generation is acceptable" +
         " (split generation reads and caches file footers). HYBRID chooses between the above strategies" +
         " based on heuristics."),
+    HIVE_ORC_BLOB_STORAGE_SPLIT_SIZE("hive.exec.orc.blob.storage.split.size", 128L * 1024 * 1024,
+      "When blob storage is used, BI split strategy does not have block locations for splitting orc files.\n" +
+        "In such cases, split generation will use this config to split orc file"),
     HIVE_ORC_WRITER_LLAP_MEMORY_MANAGER_ENABLED("hive.exec.orc.writer.llap.memory.manager.enabled", true,
       "Whether orc writers should use llap-aware memory manager. LLAP aware memory manager will use memory\n" +
         "per executor instead of entire heap memory when concurrent orc writers are involved. This will let\n" +
diff --git a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java
index d0d9759..4dc04f4 100644
--- a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java
+++ b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java
@@ -133,6 +133,10 @@ public class TestStreaming {
       return NAME;
     }
 
+    @Override
+    public String getScheme() {
+      return "raw";
+    }
 
     @Override
     public FileStatus getFileStatus(Path path) throws IOException {
diff --git a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java
index 63690f9..afda7d5 100644
--- a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java
+++ b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java
@@ -90,6 +90,11 @@ public class StreamingTestUtils {
     }
 
     @Override
+    public String getScheme() {
+      return "raw";
+    }
+
+    @Override
     public FileStatus getFileStatus(Path path) throws IOException {
       File file = pathToFile(path);
       if (!file.exists()) {
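Both test FileSystems above (the raw FS in TestStreaming and the one in StreamingTestUtils) gain a getScheme() override because the new code path asks the FileSystem for its scheme, and the base Hadoop FileSystem.getScheme() throws UnsupportedOperationException unless a subclass overrides it. As a rough sketch of the kind of scheme-based check BlobStorageUtils.isBlobStorageFileSystem is assumed to perform (the property name and default below are assumptions, not part of this commit):

    import java.util.Arrays;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;

    // Illustrative approximation of a scheme-based blob-store check.
    public class SchemeCheckSketch {
      public static boolean looksLikeBlobStorage(Configuration conf, FileSystem fs) {
        // Assumed to mirror hive.blobstore.supported.schemes with an s3-style default.
        String supported = conf.get("hive.blobstore.supported.schemes", "s3a,s3n,s3");
        // This call is why the test FileSystems must override getScheme(): the base
        // FileSystem implementation throws UnsupportedOperationException.
        String scheme = fs.getScheme();
        return Arrays.asList(supported.toLowerCase().split(",")).contains(scheme.toLowerCase());
      }
    }

Returning "raw" keeps these local test FileSystems on the non-blob-store branch, so the existing block-location-based split tests are unaffected.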
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index ca25449..9dac185 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -18,6 +18,7 @@
 
 package org.apache.hadoop.hive.ql.io.orc;
 
+import org.apache.hadoop.hive.common.BlobStorageUtils;
 import org.apache.hadoop.hive.common.NoDynamicValuesException;
 import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
@@ -1074,16 +1075,28 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
           if (fileKey == null && allowSyntheticFileIds) {
             fileKey = new SyntheticFileId(fileStatus);
           }
-          TreeMap<Long, BlockLocation> blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus);
-          for (Map.Entry<Long, BlockLocation> entry : blockOffsets.entrySet()) {
-            if(entry.getKey() + entry.getValue().getLength() > logicalLen) {
-              //don't create splits for anything past logical EOF
-              continue;
+          if (BlobStorageUtils.isBlobStorageFileSystem(conf, fs)) {
+            final long splitSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVE_ORC_BLOB_STORAGE_SPLIT_SIZE);
+            LOG.info("Blob storage detected for BI split strategy. Splitting files at boundary {}..", splitSize);
+            long start;
+            for (start = 0; start < logicalLen; start = start + splitSize) {
+              OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, start,
+                Math.min(splitSize, logicalLen - start), null, null, isOriginal, true,
+                deltas, -1, logicalLen, dir, offsetAndBucket);
+              splits.add(orcSplit);
             }
-            OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(),
+          } else {
+            TreeMap<Long, BlockLocation> blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus);
+            for (Map.Entry<Long, BlockLocation> entry : blockOffsets.entrySet()) {
+              if (entry.getKey() + entry.getValue().getLength() > logicalLen) {
+                //don't create splits for anything past logical EOF
+                continue;
+              }
+              OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(),
                 entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true,
                 deltas, -1, logicalLen, dir, offsetAndBucket);
-            splits.add(orcSplit);
+              splits.add(orcSplit);
+            }
           }
         }
       }
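Read on its own, the blob-store branch above is a fixed-size chunking of the file up to logicalLen, since no block locations are available to align splits with. A self-contained sketch of that loop, with the OrcSplit construction replaced by a plain offset/length pair (names here are hypothetical):

    import java.util.ArrayList;
    import java.util.List;

    // Fixed-size split generation when block locations are unavailable (blob storage).
    public class FixedSizeSplitSketch {
      static final class Range {
        final long start;
        final long length;
        Range(long start, long length) { this.start = start; this.length = length; }
      }

      static List<Range> split(long logicalLen, long splitSize) {
        List<Range> ranges = new ArrayList<>();
        for (long start = 0; start < logicalLen; start += splitSize) {
          // The last split is clipped so no range extends past the logical EOF.
          ranges.add(new Range(start, Math.min(splitSize, logicalLen - start)));
        }
        return ranges;
      }

      public static void main(String[] args) {
        // A 300 MB file with the default 128 MB split size yields 128, 128 and 44 MB splits.
        long mb = 1024L * 1024;
        for (Range r : split(300 * mb, 128 * mb)) {
          System.out.println("start=" + r.start + " length=" + r.length);
        }
      }
    }

Note that the split's host array is passed as null in the blob branch of the diff, which is expected on blob stores where data locality generally does not apply.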
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 25cd657..9a8ae3b 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -1237,6 +1237,11 @@ public class TestInputOutputFormat {
       }
     }
 
+    @Override
+    public String getScheme() {
+      return "mock";
+    }
+
     // increments file modification time
     public void touch(MockFile file) {
       if (fileStatusMap.containsKey(file)) {