You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by pr...@apache.org on 2016/03/21 18:32:19 UTC
hive git commit: HIVE-13291: ORC BI Split strategy should consider block size instead of file size (Prasanth Jayachandran reviewed by Gopal V)
Repository: hive
Updated Branches:
refs/heads/master 4588c6076 -> 15220e8b5
HIVE-13291: ORC BI Split strategy should consider block size instead of file size (Prasanth Jayachandran reviewed by Gopal V)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/15220e8b
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/15220e8b
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/15220e8b
Branch: refs/heads/master
Commit: 15220e8b52bf934500ff8d98a131ae1059cfe6dc
Parents: 4588c60
Author: Prasanth Jayachandran <j....@gmail.com>
Authored: Mon Mar 21 12:31:52 2016 -0500
Committer: Prasanth Jayachandran <j....@gmail.com>
Committed: Mon Mar 21 12:31:52 2016 -0500
----------------------------------------------------------------------
.../hadoop/hive/ql/io/orc/OrcInputFormat.java | 14 +--
.../hive/ql/io/orc/TestInputOutputFormat.java | 95 ++++++++++++++++++++
2 files changed, 102 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/15220e8b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 8b611bb..fe0be7b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -891,7 +891,6 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
private final boolean isOriginal;
private final List<DeltaMetaData> deltas;
private final FileSystem fs;
- private final Context context;
private final Path dir;
private final boolean allowSyntheticFileIds;
@@ -899,7 +898,6 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
Path dir, List<HdfsFileStatusWithId> fileStatuses, boolean isOriginal,
List<DeltaMetaData> deltas, boolean[] covered, boolean allowSyntheticFileIds) {
super(dir, context.numBuckets, deltas, covered);
- this.context = context;
this.fileStatuses = fileStatuses;
this.isOriginal = isOriginal;
this.deltas = deltas;
@@ -914,15 +912,17 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
for (HdfsFileStatusWithId file : fileStatuses) {
FileStatus fileStatus = file.getFileStatus();
if (fileStatus.getLen() != 0) {
- String[] hosts = SHIMS.getLocationsWithOffset(fs, fileStatus).firstEntry().getValue()
- .getHosts();
Object fileKey = file.getFileId();
if (fileKey == null && allowSyntheticFileIds) {
fileKey = new SyntheticFileId(fileStatus);
}
- OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, 0,
- fileStatus.getLen(), hosts, null, isOriginal, true, deltas, -1);
- splits.add(orcSplit);
+ TreeMap<Long, BlockLocation> blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus);
+ for (Map.Entry<Long, BlockLocation> entry : blockOffsets.entrySet()) {
+ OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(),
+ entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true,
+ deltas, -1);
+ splits.add(orcSplit);
+ }
}
}
http://git-wip-us.apache.org/repos/asf/hive/blob/15220e8b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 1a64f3a..c88f6d8 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -555,6 +555,101 @@ public class TestInputOutputFormat {
}
@Test
+ public void testBIStrategySplitBlockBoundary() throws Exception {
+ conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI");
+ OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
+ MockFileSystem fs = new MockFileSystem(conf,
+ new MockFile("mock:/a/b/part-00", 1000, new byte[1], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-01", 1000, new byte[1], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-02", 1000, new byte[1], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-03", 1000, new byte[1], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-04", 1000, new byte[1], new MockBlock("host1", "host2")));
+ OrcInputFormat.FileGenerator gen =
+ new OrcInputFormat.FileGenerator(context, fs,
+ new MockPath(fs, "mock:/a/b"), false, null);
+ OrcInputFormat.SplitStrategy splitStrategy = createSplitStrategy(context, gen);
+ assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+ List<OrcSplit> splits = splitStrategy.getSplits();
+ int numSplits = splits.size();
+ assertEquals(5, numSplits);
+
+ context = new OrcInputFormat.Context(conf);
+ fs = new MockFileSystem(conf,
+ new MockFile("mock:/a/b/part-00", 1000, new byte[1000], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-01", 1000, new byte[1000], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-02", 1000, new byte[1000], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-03", 1000, new byte[1000], new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-04", 1000, new byte[1000], new MockBlock("host1", "host2")));
+ gen = new OrcInputFormat.FileGenerator(context, fs,
+ new MockPath(fs, "mock:/a/b"), false, null);
+ splitStrategy = createSplitStrategy(context, gen);
+ assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+ splits = splitStrategy.getSplits();
+ numSplits = splits.size();
+ assertEquals(5, numSplits);
+
+ context = new OrcInputFormat.Context(conf);
+ fs = new MockFileSystem(conf,
+ new MockFile("mock:/a/b/part-00", 1000, new byte[1100], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-01", 1000, new byte[1100], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-02", 1000, new byte[1100], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-03", 1000, new byte[1100], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-04", 1000, new byte[1100], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")));
+ gen = new OrcInputFormat.FileGenerator(context, fs,
+ new MockPath(fs, "mock:/a/b"), false, null);
+ splitStrategy = createSplitStrategy(context, gen);
+ assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+ splits = splitStrategy.getSplits();
+ numSplits = splits.size();
+ assertEquals(10, numSplits);
+
+ context = new OrcInputFormat.Context(conf);
+ fs = new MockFileSystem(conf,
+ new MockFile("mock:/a/b/part-00", 1000, new byte[2000], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-01", 1000, new byte[2000], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-02", 1000, new byte[2000], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-03", 1000, new byte[2000], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-04", 1000, new byte[2000], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2")));
+ gen = new OrcInputFormat.FileGenerator(context, fs,
+ new MockPath(fs, "mock:/a/b"), false, null);
+ splitStrategy = createSplitStrategy(context, gen);
+ assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+ splits = splitStrategy.getSplits();
+ numSplits = splits.size();
+ assertEquals(10, numSplits);
+
+ context = new OrcInputFormat.Context(conf);
+ fs = new MockFileSystem(conf,
+ new MockFile("mock:/a/b/part-00", 1000, new byte[2200], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-01", 1000, new byte[2200], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-02", 1000, new byte[2200], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-03", 1000, new byte[2200], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+ new MockFile("mock:/a/b/part-04", 1000, new byte[2200], new MockBlock("host1", "host2"),
+ new MockBlock("host1", "host2"), new MockBlock("host1", "host2")));
+ gen = new OrcInputFormat.FileGenerator(context, fs,
+ new MockPath(fs, "mock:/a/b"), false, null);
+ splitStrategy = createSplitStrategy(context, gen);
+ assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+ splits = splitStrategy.getSplits();
+ numSplits = splits.size();
+ assertEquals(15, numSplits);
+ }
+
+ @Test
public void testEtlCombinedStrategy() throws Exception {
conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "ETL");
conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_DIRECTORY_BATCH_MS.varname, "1000000");