You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by pr...@apache.org on 2015/05/07 03:38:55 UTC

[3/3] hive git commit: LLAP: Implement Tez SplitSizeEstimator for Orc (Prasanth Jayachandran)

LLAP: Implement Tez SplitSizeEstimator for Orc (Prasanth Jayachandran)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/7f21a425
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/7f21a425
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/7f21a425

Branch: refs/heads/llap
Commit: 7f21a4254dff893ff6d882ab66e018075a37d484
Parents: f16993d
Author: Prasanth Jayachandran <j....@gmail.com>
Authored: Wed May 6 18:38:36 2015 -0700
Committer: Prasanth Jayachandran <j....@gmail.com>
Committed: Wed May 6 18:38:36 2015 -0700

----------------------------------------------------------------------
 .../ql/exec/tez/ColumnarSplitSizeEstimator.java | 59 ++++++++++++++++++++
 .../hadoop/hive/ql/exec/tez/SplitGrouper.java   |  2 +-
 .../apache/hadoop/hive/ql/io/ColumnarSplit.java | 33 +++++++++++
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java   |  2 +-
 .../apache/hadoop/hive/ql/io/orc/OrcSplit.java  |  8 ++-
 5 files changed, 99 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/7f21a425/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java
new file mode 100644
index 0000000..bf830eb
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.tez;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.io.ColumnarSplit;
+import org.apache.hadoop.hive.ql.io.HiveInputFormat;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.split.SplitSizeEstimator;
+
+/**
+ * Split size estimator for columnar file formats.
+ */
+public class ColumnarSplitSizeEstimator implements SplitSizeEstimator {
+  private static final Log LOG = LogFactory.getLog(ColumnarSplitSizeEstimator.class);
+  private static final boolean isDebugEnabled = LOG.isDebugEnabled();
+
+  /**
+   * Returns the columnar projection size when the split, or the split wrapped by a
+   * HiveInputSplit, is a ColumnarSplit; otherwise returns the length of the given split.
+   */
+  @Override
+  public long getEstimatedSize(InputSplit inputSplit) throws IOException {
+    // Read up front for every split so the fallback value is always available.
+    final long splitLength = inputSplit.getLength();
+
+    InputSplit candidate = inputSplit;
+    if (!(inputSplit instanceof ColumnarSplit)
+        && inputSplit instanceof HiveInputFormat.HiveInputSplit) {
+      candidate = ((HiveInputFormat.HiveInputSplit) inputSplit).getInputSplit();
+    }
+    if (!(candidate instanceof ColumnarSplit)) {
+      return splitLength;
+    }
+    final long projectionSize = ((ColumnarSplit) candidate).getColumnarProjectionSize();
+    if (isDebugEnabled) {
+      LOG.debug("Estimated column projection size: " + projectionSize);
+    }
+    return projectionSize;
+  }
+}

http://git-wip-us.apache.org/repos/asf/hive/blob/7f21a425/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
index c12da37..f95aabf 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java
@@ -90,7 +90,7 @@ public class SplitGrouper {
       InputSplit[] rawSplits = inputSplitCollection.toArray(new InputSplit[0]);
       InputSplit[] groupedSplits =
           tezGrouper.getGroupedSplits(conf, rawSplits, bucketTaskMap.get(bucketId),
-              HiveInputFormat.class.getName());
+              HiveInputFormat.class.getName(), new ColumnarSplitSizeEstimator());
 
       LOG.info("Original split size is " + rawSplits.length + " grouped split size is "
           + groupedSplits.length + ", for bucket: " + bucketId);

http://git-wip-us.apache.org/repos/asf/hive/blob/7f21a425/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java
new file mode 100644
index 0000000..ed8fc35
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/ColumnarSplit.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.io;
+
+/**
+ * Implemented by splits that can report the estimated size of the columnar projections
+ * that will be read from the split. The split grouper uses this estimate to group splits
+ * by the data actually read instead of by the complete split length.
+ */
+public interface ColumnarSplit {
+
+  /**
+   * Returns the estimated size of the column projections that will be read from this split.
+   *
+   * @return estimated size, in bytes, of the column projections that will be read
+   */
+  long getColumnarProjectionSize();
+}

http://git-wip-us.apache.org/repos/asf/hive/blob/7f21a425/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 2548106..340f55c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -1053,7 +1053,7 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
     if (isDebugEnabled) {
       for (OrcSplit split : splits) {
         LOG.debug(split + " projected_columns_uncompressed_size: "
-            + split.getProjectedColumnsUncompressedSize());
+            + split.getColumnarProjectionSize());
       }
     }
     return splits;

http://git-wip-us.apache.org/repos/asf/hive/blob/7f21a425/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
index 1263346..fa78703 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java
@@ -28,6 +28,7 @@ import java.util.List;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.io.ColumnarSplit;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.WritableUtils;
 import org.apache.hadoop.mapred.FileSplit;
@@ -38,7 +39,7 @@ import org.apache.hadoop.mapred.FileSplit;
  * OrcFileSplit. Holds file meta info
  *
  */
-public class OrcSplit extends FileSplit {
+public class OrcSplit extends FileSplit implements ColumnarSplit {
   private static final Log LOG = LogFactory.getLog(OrcSplit.class);
 
   private FileMetaInfo fileMetaInfo;
@@ -74,7 +75,7 @@ public class OrcSplit extends FileSplit {
     this.isOriginal = isOriginal;
     this.hasBase = hasBase;
     this.deltas.addAll(deltas);
-    this.projColsUncompressedSize = projectedDataSize;
+    this.projColsUncompressedSize = projectedDataSize <= 0 ? length : projectedDataSize;
   }
 
   @Override
@@ -173,7 +174,8 @@ public class OrcSplit extends FileSplit {
     return fileId;
   }
 
-  public long getProjectedColumnsUncompressedSize() {
+  @Override
+  public long getColumnarProjectionSize() {
     return projColsUncompressedSize;
   }