You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2019/03/09 17:15:17 UTC
[hive] branch master updated: HIVE-21390 : BI split strategy does not work for blob stores (Prasanth Jayachandran reviewed by Gopal V)
This is an automated email from the ASF dual-hosted git repository.
hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 13ec7c3 HIVE-21390 : BI split strategy does not work for blob stores (Prasanth Jayachandran reviewed by Gopal V)
13ec7c3 is described below
commit 13ec7c3ab47001df25e7be87739731192075b0a7
Author: Prasanth Jayachandran <pr...@apache.org>
AuthorDate: Sat Mar 9 09:13:59 2019 -0800
HIVE-21390 : BI split strategy does not work for blob stores (Prasanth Jayachandran reviewed by Gopal V)
Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
---
.../java/org/apache/hadoop/hive/conf/HiveConf.java | 3 +++
.../hive/hcatalog/streaming/TestStreaming.java | 4 ++++
.../streaming/mutate/StreamingTestUtils.java | 5 ++++
.../hadoop/hive/ql/io/orc/OrcInputFormat.java | 27 ++++++++++++++++------
.../hive/ql/io/orc/TestInputOutputFormat.java | 5 ++++
5 files changed, 37 insertions(+), 7 deletions(-)
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 076035b..c33d03e 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1984,6 +1984,9 @@ public class HiveConf extends Configuration {
" ETL strategy is used when spending little more time in split generation is acceptable" +
" (split generation reads and caches file footers). HYBRID chooses between the above strategies" +
" based on heuristics."),
+ HIVE_ORC_BLOB_STORAGE_SPLIT_SIZE("hive.exec.orc.blob.storage.split.size", 128L * 1024 * 1024,
+ "When blob storage is used, BI split strategy does not have block locations for splitting orc files.\n" +
+ "In such cases, split generation will use this config to split orc file"),
HIVE_ORC_WRITER_LLAP_MEMORY_MANAGER_ENABLED("hive.exec.orc.writer.llap.memory.manager.enabled", true,
"Whether orc writers should use llap-aware memory manager. LLAP aware memory manager will use memory\n" +
"per executor instead of entire heap memory when concurrent orc writers are involved. This will let\n" +
diff --git a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java
index d0d9759..4dc04f4 100644
--- a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java
+++ b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/TestStreaming.java
@@ -133,6 +133,10 @@ public class TestStreaming {
return NAME;
}
+ @Override
+ public String getScheme() {
+ return "raw";
+ }
@Override
public FileStatus getFileStatus(Path path) throws IOException {
diff --git a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java
index 63690f9..afda7d5 100644
--- a/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java
+++ b/hcatalog/streaming/src/test/org/apache/hive/hcatalog/streaming/mutate/StreamingTestUtils.java
@@ -90,6 +90,11 @@ public class StreamingTestUtils {
}
@Override
+ public String getScheme() {
+ return "raw";
+ }
+
+ @Override
public FileStatus getFileStatus(Path path) throws IOException {
File file = pathToFile(path);
if (!file.exists()) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index ca25449..9dac185 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.hive.ql.io.orc;
+import org.apache.hadoop.hive.common.BlobStorageUtils;
import org.apache.hadoop.hive.common.NoDynamicValuesException;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hdfs.DistributedFileSystem;
@@ -1074,16 +1075,28 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
if (fileKey == null && allowSyntheticFileIds) {
fileKey = new SyntheticFileId(fileStatus);
}
- TreeMap<Long, BlockLocation> blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus);
- for (Map.Entry<Long, BlockLocation> entry : blockOffsets.entrySet()) {
- if(entry.getKey() + entry.getValue().getLength() > logicalLen) {
- //don't create splits for anything past logical EOF
- continue;
+ if (BlobStorageUtils.isBlobStorageFileSystem(conf, fs)) {
+ final long splitSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVE_ORC_BLOB_STORAGE_SPLIT_SIZE);
+ LOG.info("Blob storage detected for BI split strategy. Splitting files at boundary {}..", splitSize);
+ long start;
+ for (start = 0; start < logicalLen; start = start + splitSize) {
+ OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, start,
+ Math.min(splitSize, logicalLen - start), null, null, isOriginal, true,
+ deltas, -1, logicalLen, dir, offsetAndBucket);
+ splits.add(orcSplit);
}
- OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(),
+ } else {
+ TreeMap<Long, BlockLocation> blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus);
+ for (Map.Entry<Long, BlockLocation> entry : blockOffsets.entrySet()) {
+ if (entry.getKey() + entry.getValue().getLength() > logicalLen) {
+ //don't create splits for anything past logical EOF
+ continue;
+ }
+ OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(),
entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true,
deltas, -1, logicalLen, dir, offsetAndBucket);
- splits.add(orcSplit);
+ splits.add(orcSplit);
+ }
}
}
}
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 25cd657..9a8ae3b 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -1237,6 +1237,11 @@ public class TestInputOutputFormat {
}
}
+ @Override
+ public String getScheme() {
+ return "mock";
+ }
+
// increments file modification time
public void touch(MockFile file) {
if (fileStatusMap.containsKey(file)) {