Posted to commits@hive.apache.org by pr...@apache.org on 2015/08/14 23:22:03 UTC

hive git commit: HIVE-11546: Projected columns read size should be scaled to split size for ORC Splits (Prasanth Jayachandran reviewed by Sergey Shelukhin)

Repository: hive
Updated Branches:
  refs/heads/master cf0481fcf -> e8b2c605a


HIVE-11546: Projected columns read size should be scaled to split size for ORC Splits (Prasanth Jayachandran reviewed by Sergey Shelukhin)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e8b2c605
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e8b2c605
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e8b2c605

Branch: refs/heads/master
Commit: e8b2c605a05d06c1bfcd4c6bc611bc7f83306b38
Parents: cf0481f
Author: Prasanth Jayachandran <j....@gmail.com>
Authored: Fri Aug 14 14:21:51 2015 -0700
Committer: Prasanth Jayachandran <j....@gmail.com>
Committed: Fri Aug 14 14:21:51 2015 -0700

----------------------------------------------------------------------
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java   | 30 ++++++-
 .../apache/hadoop/hive/ql/io/orc/Reader.java    |  7 ++
 .../hadoop/hive/ql/io/orc/ReaderImpl.java       |  5 +-
 .../hive/ql/io/orc/TestInputOutputFormat.java   | 95 ++++++++++++++++++--
 4 files changed, 125 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
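For readers skimming the diffstat, the heart of the change is a pro-rating step in SplitGenerator.createSplit(): the projected-column raw data size is scaled by the fraction of the file a split covers. A minimal standalone sketch of that arithmetic, mirroring the names in the diff below (the helper itself is illustrative, not code from the patch):

    // Pro-rate the projected-column raw data size by the fraction of the
    // file this split covers; when no projection size is available, fall
    // back to the file length (mirrors the hunk in createSplit() below).
    static long scaleProjectedSize(long projColsUncompressedSize,
                                   long splitLength, long fileLen) {
      final double splitRatio = (double) splitLength / (double) fileLen;
      return projColsUncompressedSize > 0
          ? (long) (splitRatio * projColsUncompressedSize)
          : fileLen;
    }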


http://git-wip-us.apache.org/repos/asf/hive/blob/e8b2c605/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index fe2eccd..6ed7872 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -717,6 +717,7 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
     private ReaderImpl.FileMetaInfo fileMetaInfo;
     private Metadata metadata;
     private List<OrcProto.Type> types;
+    private boolean[] includedCols;
     private final boolean isOriginal;
     private final List<DeltaMetaData> deltas;
     private final boolean hasBase;
@@ -830,8 +831,14 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
         hosts = new String[hostList.size()];
         hostList.toArray(hosts);
       }
+
+      // scale the raw data size to split level based on the ratio of split length to file length
+      final long fileLen = file.getLen();
+      final double splitRatio = (double) length / (double) fileLen;
+      final long scaledProjSize = projColsUncompressedSize > 0 ?
+          (long) (splitRatio * projColsUncompressedSize) : fileLen;
       return new OrcSplit(file.getPath(), offset, length, hosts, fileMetaInfo,
-          isOriginal, hasBase, deltas, projColsUncompressedSize);
+          isOriginal, hasBase, deltas, scaledProjSize);
     }
 
     /**
@@ -845,11 +852,12 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
 
       // figure out which stripes we need to read
       boolean[] includeStripe = null;
+
       // we can't eliminate stripes if there are deltas because the
       // deltas may change the rows making them match the predicate.
       if (deltas.isEmpty()) {
         Reader.Options options = new Reader.Options();
-        options.include(genIncludedColumns(types, context.conf, isOriginal));
+        options.include(includedCols);
         setSearchArgument(options, types, context.conf, isOriginal);
         // only do split pruning if HIVE-8732 has been fixed in the writer
         if (options.getSearchArgument() != null &&
@@ -930,8 +938,6 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
     private void populateAndCacheStripeDetails() throws IOException {
       Reader orcReader = OrcFile.createReader(file.getPath(),
           OrcFile.readerOptions(context.conf).filesystem(fs));
-      List<String> projCols = ColumnProjectionUtils.getReadColumnNames(context.conf);
-      // TODO: produce projColsUncompressedSize from projCols
       if (fileInfo != null) {
         stripes = fileInfo.stripeInfos;
         fileMetaInfo = fileInfo.fileMetaInfo;
@@ -959,6 +965,22 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
                   metadata, types, fileMetaInfo, writerVersion));
         }
       }
+      includedCols = genIncludedColumns(types, context.conf, isOriginal);
+      projColsUncompressedSize = computeProjectionSize(orcReader, includedCols, isOriginal);
+    }
+
+    private long computeProjectionSize(final Reader orcReader, final boolean[] includedCols,
+        final boolean isOriginal) {
+      final int rootIdx = getRootColumn(isOriginal);
+      List<Integer> internalColIds = Lists.newArrayList();
+      if (includedCols != null) {
+        for (int i = 0; i < includedCols.length; i++) {
+          if (includedCols[i]) {
+            internalColIds.add(rootIdx + i);
+          }
+        }
+      }
+      return orcReader.getRawDataSizeFromColIndices(internalColIds);
     }
 
     private boolean isStripeSatisfyPredicate(StripeStatistics stripeStatistics,
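
A note on the hunk above: computeProjectionSize() translates the reader-level inclusion flags into the file's internal (flattened) column ids before asking the reader for their raw size. For original files the root column is 0, so flag i maps straight to column i; for ACID files the row struct sits behind the transaction metadata columns and rootIdx shifts the ids accordingly. A self-contained sketch of that mapping (illustrative only; e.g. flags {true, false, true} with rootIdx 0 yield ids [0, 2]):

    import java.util.ArrayList;
    import java.util.List;

    // Map reader inclusion flags to internal column ids, offset by the
    // root column index (0 for original files, shifted for ACID files).
    static List<Integer> toInternalColIds(boolean[] includedCols, int rootIdx) {
      List<Integer> ids = new ArrayList<Integer>();
      if (includedCols != null) {
        for (int i = 0; i < includedCols.length; i++) {
          if (includedCols[i]) {
            ids.add(rootIdx + i);
          }
        }
      }
      return ids;
    }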

http://git-wip-us.apache.org/repos/asf/hive/blob/e8b2c605/ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java
index 6f4f013..7bddefc 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java
@@ -52,6 +52,13 @@ public interface Reader {
   long getRawDataSizeOfColumns(List<String> colNames);
 
   /**
+   * Get the deserialized data size of the specified column ids
+   * @param colIds - internal column ids (check orcfiledump for column ids)
+   * @return raw data size of the specified columns
+   */
+  long getRawDataSizeFromColIndices(List<Integer> colIds);
+
+  /**
    * Get the user metadata keys.
    * @return the set of metadata keys
    */
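
Since this makes getRawDataSizeFromColIndices part of the public Reader interface, a hedged usage sketch (the reader construction mirrors what OrcInputFormat already does; the path and column ids here are hypothetical):

    import java.util.Arrays;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;

    Configuration conf = new Configuration();
    // Hypothetical ORC file path, for illustration only.
    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(conf));
    // Internal column ids as printed by `hive --orcfiledump`; made up here.
    long rawSize = reader.getRawDataSizeFromColIndices(Arrays.asList(1, 2));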

http://git-wip-us.apache.org/repos/asf/hive/blob/e8b2c605/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
index 3c0de3c..a6448b6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
@@ -575,7 +575,8 @@ public class ReaderImpl implements Reader {
     return deserializedSize;
   }
 
-  private long getRawDataSizeFromColIndices(List<Integer> colIndices) {
+  @Override
+  public long getRawDataSizeFromColIndices(List<Integer> colIndices) {
     long result = 0;
     for (int colIdx : colIndices) {
       result += getRawDataSizeOfColumn(colIdx);
@@ -620,7 +621,7 @@ public class ReaderImpl implements Reader {
     case BYTE:
       return numVals * JavaDataModel.get().primitive1();
     default:
-      LOG.debug("Unknown primitive category.");
+      LOG.debug("Unknown primitive category: " + type.getKind());
       break;
     }
 

http://git-wip-us.apache.org/repos/asf/hive/blob/e8b2c605/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 6cb8529..0c12c89 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -17,9 +17,6 @@
  */
 package org.apache.hadoop.hive.ql.io.orc;
 
-import com.esotericsoftware.kryo.Kryo;
-import com.esotericsoftware.kryo.io.Output;
-
 import static org.junit.Assert.assertArrayEquals;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
@@ -108,6 +105,9 @@ import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TestName;
 
+import com.esotericsoftware.kryo.Kryo;
+import com.esotericsoftware.kryo.io.Output;
+
 public class TestInputOutputFormat {
 
   public static String toKryo(SearchArgument sarg) {
@@ -902,14 +902,25 @@ public class TestInputOutputFormat {
     }
     fill(buffer, offset);
     footer.addTypes(OrcProto.Type.newBuilder()
-                     .setKind(OrcProto.Type.Kind.STRUCT)
-                     .addFieldNames("col1")
-                     .addSubtypes(1));
+        .setKind(OrcProto.Type.Kind.STRUCT)
+        .addFieldNames("col1")
+        .addSubtypes(1));
     footer.addTypes(OrcProto.Type.newBuilder()
         .setKind(OrcProto.Type.Kind.STRING));
     footer.setNumberOfRows(1000 * stripeLengths.length)
           .setHeaderLength(headerLen)
           .setContentLength(offset - headerLen);
+    footer.addStatistics(OrcProto.ColumnStatistics.newBuilder()
+        .setNumberOfValues(1000 * stripeLengths.length).build());
+    footer.addStatistics(OrcProto.ColumnStatistics.newBuilder()
+        .setNumberOfValues(1000 * stripeLengths.length)
+        .setStringStatistics(
+            OrcProto.StringStatistics.newBuilder()
+                .setMaximum("zzz")
+                .setMinimum("aaa")
+                .setSum(1000 * 3 * stripeLengths.length)
+                .build()
+        ).build());
     footer.build().writeTo(buffer);
     int footerEnd = buffer.getLength();
     OrcProto.PostScript ps =
@@ -1013,6 +1024,78 @@ public class TestInputOutputFormat {
   }
 
   @Test
+  public void testProjectedColumnSize() throws Exception {
+    long[] stripeSizes =
+        new long[]{200, 200, 200, 200, 100};
+    MockFileSystem fs = new MockFileSystem(conf,
+        new MockFile("mock:/a/file", 500,
+            createMockOrcFile(stripeSizes),
+            new MockBlock("host1-1", "host1-2", "host1-3"),
+            new MockBlock("host2-1", "host0", "host2-3"),
+            new MockBlock("host0", "host3-2", "host3-3"),
+            new MockBlock("host4-1", "host4-2", "host4-3"),
+            new MockBlock("host5-1", "host5-2", "host5-3")));
+    conf.setInt(OrcInputFormat.MAX_SPLIT_SIZE, 300);
+    conf.setInt(OrcInputFormat.MIN_SPLIT_SIZE, 200);
+    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
+    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
+    OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
+    OrcInputFormat.SplitGenerator splitter =
+        new OrcInputFormat.SplitGenerator(new OrcInputFormat.SplitInfo(context, fs,
+            fs.getFileStatus(new Path("/a/file")), null, true,
+            new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null));
+    List<OrcSplit> results = splitter.call();
+    OrcSplit result = results.get(0);
+    assertEquals(3, results.size());
+    assertEquals(3, result.getStart());
+    assertEquals(400, result.getLength());
+    assertEquals(167468, result.getProjectedColumnsUncompressedSize());
+    result = results.get(1);
+    assertEquals(403, result.getStart());
+    assertEquals(400, result.getLength());
+    assertEquals(167468, result.getProjectedColumnsUncompressedSize());
+    result = results.get(2);
+    assertEquals(803, result.getStart());
+    assertEquals(100, result.getLength());
+    assertEquals(41867, result.getProjectedColumnsUncompressedSize());
+
+    // test min = 0, max = 0 generates each stripe
+    conf.setInt(OrcInputFormat.MIN_SPLIT_SIZE, 0);
+    conf.setInt(OrcInputFormat.MAX_SPLIT_SIZE, 0);
+    context = new OrcInputFormat.Context(conf);
+    splitter = new OrcInputFormat.SplitGenerator(new OrcInputFormat.SplitInfo(context, fs,
+        fs.getFileStatus(new Path("/a/file")), null, true,
+        new ArrayList<AcidInputFormat.DeltaMetaData>(),
+        true, null, null));
+    results = splitter.call();
+    assertEquals(5, results.size());
+    for (int i = 0; i < stripeSizes.length; ++i) {
+      assertEquals("checking stripe " + i + " size",
+          stripeSizes[i], results.get(i).getLength());
+      if (i == stripeSizes.length - 1) {
+        assertEquals(41867, results.get(i).getProjectedColumnsUncompressedSize());
+      } else {
+        assertEquals(83734, results.get(i).getProjectedColumnsUncompressedSize());
+      }
+    }
+
+    // single split
+    conf.setInt(OrcInputFormat.MIN_SPLIT_SIZE, 100000);
+    conf.setInt(OrcInputFormat.MAX_SPLIT_SIZE, 1000);
+    context = new OrcInputFormat.Context(conf);
+    splitter = new OrcInputFormat.SplitGenerator(new OrcInputFormat.SplitInfo(context, fs,
+        fs.getFileStatus(new Path("/a/file")), null, true,
+        new ArrayList<AcidInputFormat.DeltaMetaData>(),
+        true, null, null));
+    results = splitter.call();
+    assertEquals(1, results.size());
+    result = results.get(0);
+    assertEquals(3, result.getStart());
+    assertEquals(900, result.getLength());
+    assertEquals(376804, result.getProjectedColumnsUncompressedSize());
+  }
+
+  @Test
   @SuppressWarnings("unchecked,deprecation")
   public void testInOutFormat() throws Exception {
     Properties properties = new Properties();
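
For orientation, the sizes asserted in testProjectedColumnSize line up with the pro-rating in createSplit(): the single 900-byte split reports a projected size of 376804, and each smaller split carries that total scaled by its share of the stripe data, truncated to a long. For example, 376804 * 400/900 = 167468 for the two 400-byte splits, 376804 * 200/900 = 83734 for each standalone 200-byte stripe, and 376804 * 100/900 = 41867 for the 100-byte tail.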