You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by pr...@apache.org on 2015/05/07 03:38:55 UTC
[3/3] hive git commit: LLAP: Implement Tez SplitSizeEstimator for Orc
(Prasanth Jayachandran)
LLAP: Implement Tez SplitSizeEstimator for Orc (Prasanth Jayachandran)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/7f21a425
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/7f21a425
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/7f21a425
Branch: refs/heads/llap
Commit: 7f21a4254dff893ff6d882ab66e018075a37d484
Parents: f16993d
Author: Prasanth Jayachandran <j....@gmail.com>
Authored: Wed May 6 18:38:36 2015 -0700
Committer: Prasanth Jayachandran <j....@gmail.com>
Committed: Wed May 6 18:38:36 2015 -0700
----------------------------------------------------------------------
.../ql/exec/tez/ColumnarSplitSizeEstimator.java | 59 ++++++++++++++++++++
.../hadoop/hive/ql/exec/tez/SplitGrouper.java | 2 +-
.../apache/hadoop/hive/ql/io/ColumnarSplit.java | 33 +++++++++++
.../hadoop/hive/ql/io/orc/OrcInputFormat.java | 2 +-
.../apache/hadoop/hive/ql/io/orc/OrcSplit.java | 8 ++-
5 files changed, 99 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/7f21a425/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java
new file mode 100644
index 0000000..bf830eb
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.tez;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.io.ColumnarSplit;
+import org.apache.hadoop.hive.ql.io.HiveInputFormat;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.split.SplitSizeEstimator;
+
+/**
+ * Split size estimator for columnar file formats.
+ */
+public class ColumnarSplitSizeEstimator implements SplitSizeEstimator {
+  private static final Log LOG = LogFactory.getLog(ColumnarSplitSizeEstimator.class);
+  private static final boolean isDebugEnabled = LOG.isDebugEnabled();
+
+  /**
+   * Estimates a split's size as its columnar projection size when one is available,
+   * falling back to the raw split length otherwise.
+   */
+  @Override
+  public long getEstimatedSize(InputSplit inputSplit) throws IOException {
+    final long rawLength = inputSplit.getLength();
+
+    // Look through a HiveInputSplit wrapper unless the split is itself columnar.
+    InputSplit candidate = inputSplit;
+    if (!(candidate instanceof ColumnarSplit)
+        && candidate instanceof HiveInputFormat.HiveInputSplit) {
+      candidate = ((HiveInputFormat.HiveInputSplit) candidate).getInputSplit();
+    }
+    if (!(candidate instanceof ColumnarSplit)) {
+      return rawLength;
+    }
+    final long projectedSize = ((ColumnarSplit) candidate).getColumnarProjectionSize();
+    if (isDebugEnabled) {
+      LOG.debug("Estimated column projection size: " + projectedSize);
+    }
+    return projectedSize;
+  }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/7f21a425/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
index c12da37..f95aabf 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
@@ -90,7 +90,7 @@ public class SplitGrouper {
InputSplit[] rawSplits = inputSplitCollection.toArray(new InputSplit[0]);
InputSplit[] groupedSplits =
tezGrouper.getGroupedSplits(conf, rawSplits, bucketTaskMap.get(bucketId),
- HiveInputFormat.class.getName());
+ HiveInputFormat.class.getName(), new ColumnarSplitSizeEstimator());
LOG.info("Original split size is " + rawSplits.length + " grouped split size is "
+ groupedSplits.length + ", for bucket: " + bucketId);
http://git-wip-us.apache.org/repos/asf/hive/blob/7f21a425/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java
new file mode 100644
index 0000000..ed8fc35
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.io;
+
+/**
+ * Implemented by splits that can report the estimated size of the column projections that
+ * will be read from them. The split grouper uses this estimate to group splits by the data
+ * actually read instead of by the complete split length.
+ */
+public interface ColumnarSplit {
+
+ /**
+ * Returns the estimated size of the column projections that will be read from this split.
+ *
+ * @return estimated size, in bytes, of the column projections that will be read
+ */
+ long getColumnarProjectionSize();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/7f21a425/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 2548106..340f55c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -1053,7 +1053,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
if (isDebugEnabled) {
for (OrcSplit split : splits) {
LOG.debug(split + " projected_columns_uncompressed_size: "
- + split.getProjectedColumnsUncompressedSize());
+ + split.getColumnarProjectionSize());
}
}
return splits;
http://git-wip-us.apache.org/repos/asf/hive/blob/7f21a425/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
index 1263346..fa78703 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
@@ -28,6 +28,7 @@ import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.io.ColumnarSplit;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileSplit;
@@ -38,7 +39,7 @@ import org.apache.hadoop.mapred.FileSplit;
* OrcFileSplit. Holds file meta info
*
*/
-public class OrcSplit extends FileSplit {
+public class OrcSplit extends FileSplit implements ColumnarSplit {
private static final Log LOG = LogFactory.getLog(OrcSplit.class);
private FileMetaInfo fileMetaInfo;
@@ -74,7 +75,7 @@ public class OrcSplit extends FileSplit {
this.isOriginal = isOriginal;
this.hasBase = hasBase;
this.deltas.addAll(deltas);
- this.projColsUncompressedSize = projectedDataSize;
+ this.projColsUncompressedSize = projectedDataSize <= 0 ? length : projectedDataSize;
}
@Override
@@ -173,7 +174,8 @@ public class OrcSplit extends FileSplit {
return fileId;
}
- public long getProjectedColumnsUncompressedSize() {
+ @Override
+ public long getColumnarProjectionSize() {
return projColsUncompressedSize;
}