You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2016/06/30 22:00:19 UTC

[6/7] orc git commit: HIVE-13985. ORC improvements for reducing the file system calls in the task side.

HIVE-13985. ORC improvements for reducing the file system calls in the task side.


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/13ee0b3c
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/13ee0b3c
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/13ee0b3c

Branch: refs/heads/master
Commit: 13ee0b3cdb10585b8a3c0799f8e7685472d8458e
Parents: 047265c
Author: Owen O'Malley <om...@apache.org>
Authored: Thu Jun 30 10:38:32 2016 -0700
Committer: Owen O'Malley <om...@apache.org>
Committed: Thu Jun 30 14:33:58 2016 -0700

----------------------------------------------------------------------
 .../src/java/org/apache/orc/FileMetaInfo.java   |  64 -----
 java/core/src/java/org/apache/orc/OrcFile.java  |  32 +--
 java/core/src/java/org/apache/orc/OrcUtils.java |  11 +
 java/core/src/java/org/apache/orc/Reader.java   |   9 +-
 .../src/java/org/apache/orc/impl/OrcTail.java   | 140 +++++++++++
 .../java/org/apache/orc/impl/ReaderImpl.java    | 240 ++++++++++---------
 proto/orc_proto.proto                           |   1 +
 7 files changed, 304 insertions(+), 193 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/13ee0b3c/java/core/src/java/org/apache/orc/FileMetaInfo.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/FileMetaInfo.java b/java/core/src/java/org/apache/orc/FileMetaInfo.java
deleted file mode 100644
index d3cac3b..0000000
--- a/java/core/src/java/org/apache/orc/FileMetaInfo.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import java.nio.ByteBuffer;
-import java.util.List;
-
-/**
- * FileMetaInfo - represents file metadata stored in footer and postscript sections of the file
- * that is useful for Reader implementation
- *
- */
-public class FileMetaInfo {
-  public ByteBuffer footerMetaAndPsBuffer;
-  public final String compressionType;
-  public final int bufferSize;
-  public final int metadataSize;
-  public final ByteBuffer footerBuffer;
-  public final List<Integer> versionList;
-  public final OrcFile.WriterVersion writerVersion;
-
-
-  /** Ctor used when reading splits - no version list or full footer buffer. */
-  public FileMetaInfo(String compressionType, int bufferSize, int metadataSize,
-      ByteBuffer footerBuffer, OrcFile.WriterVersion writerVersion) {
-    this(compressionType, bufferSize, metadataSize, footerBuffer, null,
-        writerVersion, null);
-  }
-
-  /** Ctor used when creating file info during init and when getting a new one. */
-  public FileMetaInfo(String compressionType, int bufferSize, int metadataSize,
-      ByteBuffer footerBuffer, List<Integer> versionList,
-                      OrcFile.WriterVersion writerVersion,
-      ByteBuffer fullFooterBuffer) {
-    this.compressionType = compressionType;
-    this.bufferSize = bufferSize;
-    this.metadataSize = metadataSize;
-    this.footerBuffer = footerBuffer;
-    this.versionList = versionList;
-    this.writerVersion = writerVersion;
-    this.footerMetaAndPsBuffer = fullFooterBuffer;
-  }
-
-  public OrcFile.WriterVersion getWriterVersion() {
-    return writerVersion;
-  }
-
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/orc/blob/13ee0b3c/java/core/src/java/org/apache/orc/OrcFile.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java
index 7dd7333..ddfa9f7 100644
--- a/java/core/src/java/org/apache/orc/OrcFile.java
+++ b/java/core/src/java/org/apache/orc/OrcFile.java
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.orc.impl.MemoryManager;
+import org.apache.orc.impl.OrcTail;
 import org.apache.orc.impl.ReaderImpl;
 import org.apache.orc.impl.WriterImpl;
 
@@ -160,19 +161,17 @@ public class OrcFile {
   public static class ReaderOptions {
     private final Configuration conf;
     private FileSystem filesystem;
-    private FileMetaInfo fileMetaInfo; // TODO: this comes from some place.
     private long maxLength = Long.MAX_VALUE;
-    private FileMetadata fullFileMetadata; // Propagate from LLAP cache.
+    private OrcTail orcTail;
+    // TODO: We can generalize the FileMetadata interface: make OrcTail implement FileMetadata
+    // and remove this class altogether. Both footer caching and LLAP caching just need OrcTail.
+    // For now, keep this around to avoid complex surgery.
+    private FileMetadata fileMetadata;
 
     public ReaderOptions(Configuration conf) {
       this.conf = conf;
     }
 
-    public ReaderOptions fileMetaInfo(FileMetaInfo info) {
-      fileMetaInfo = info;
-      return this;
-    }
-
     public ReaderOptions filesystem(FileSystem fs) {
       this.filesystem = fs;
       return this;
@@ -183,8 +182,8 @@ public class OrcFile {
       return this;
     }
 
-    public ReaderOptions fileMetadata(FileMetadata metadata) {
-      this.fullFileMetadata = metadata;
+    public ReaderOptions orcTail(OrcTail tail) {
+      this.orcTail = tail;
       return this;
     }
 
@@ -196,16 +195,21 @@ public class OrcFile {
       return filesystem;
     }
 
-    public FileMetaInfo getFileMetaInfo() {
-      return fileMetaInfo;
-    }
-
     public long getMaxLength() {
       return maxLength;
     }
 
+    public OrcTail getOrcTail() {
+      return orcTail;
+    }
+
+    public ReaderOptions fileMetadata(final FileMetadata metadata) {
+      fileMetadata = metadata;
+      return this;
+    }
+
     public FileMetadata getFileMetadata() {
-      return fullFileMetadata;
+      return fileMetadata;
     }
   }
 

http://git-wip-us.apache.org/repos/asf/orc/blob/13ee0b3c/java/core/src/java/org/apache/orc/OrcUtils.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java
index 9dd7504..94493b3 100644
--- a/java/core/src/java/org/apache/orc/OrcUtils.java
+++ b/java/core/src/java/org/apache/orc/OrcUtils.java
@@ -17,6 +17,8 @@
  */
 package org.apache.orc;
 
+import org.apache.orc.impl.ReaderImpl;
+
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -525,4 +527,13 @@ public class OrcUtils {
     }
     throw new IllegalArgumentException("Unknown ORC type " + type.getKind());
   }
+
+  public static List<StripeInformation> convertProtoStripesToStripes(
+      List<OrcProto.StripeInformation> stripes) {
+    List<StripeInformation> result = new ArrayList<StripeInformation>(stripes.size());
+    for (OrcProto.StripeInformation info : stripes) {
+      result.add(new ReaderImpl.StripeInformationImpl(info));
+    }
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/orc/blob/13ee0b3c/java/core/src/java/org/apache/orc/Reader.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/Reader.java b/java/core/src/java/org/apache/orc/Reader.java
index 87f3293..c2d5235 100644
--- a/java/core/src/java/org/apache/orc/Reader.java
+++ b/java/core/src/java/org/apache/orc/Reader.java
@@ -138,6 +138,13 @@ public interface Reader {
   OrcFile.WriterVersion getWriterVersion();
 
   /**
+   * Get the file tail (footer + postscript)
+   *
+   * @return - file tail
+   */
+  OrcProto.FileTail getFileTail();
+
+  /**
    * Options for creating a RecordReader.
    */
   public static class Options {
@@ -354,7 +361,7 @@ public interface Reader {
   /**
    * @return Stripe statistics.
    */
-  List<StripeStatistics> getStripeStatistics();
+  List<StripeStatistics> getStripeStatistics() throws IOException;
 
   /**
    * @return File statistics, in original protobuf form.

http://git-wip-us.apache.org/repos/asf/orc/blob/13ee0b3c/java/core/src/java/org/apache/orc/impl/OrcTail.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/OrcTail.java b/java/core/src/java/org/apache/orc/impl/OrcTail.java
new file mode 100644
index 0000000..b5f85fb
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/impl/OrcTail.java
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import static org.apache.orc.impl.ReaderImpl.extractMetadata;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.orc.CompressionCodec;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcFile;
+import org.apache.orc.OrcProto;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.StripeStatistics;
+
+// TODO: Make OrcTail implement FileMetadata or Reader interface
+public final class OrcTail {
+  // postscript + footer - Serialized in OrcSplit
+  private final OrcProto.FileTail fileTail;
+  // serialized representation of metadata, footer and postscript
+  private final ByteBuffer serializedTail;
+  // used to invalidate cache entries
+  private final long fileModificationTime;
+  // lazily deserialized
+  private OrcProto.Metadata metadata;
+
+  public OrcTail(OrcProto.FileTail fileTail, ByteBuffer serializedTail) {
+    this(fileTail, serializedTail, -1);
+  }
+
+  public OrcTail(OrcProto.FileTail fileTail, ByteBuffer serializedTail, long fileModificationTime) {
+    this.fileTail = fileTail;
+    this.serializedTail = serializedTail;
+    this.fileModificationTime = fileModificationTime;
+    this.metadata = null;
+  }
+
+  public ByteBuffer getSerializedTail() {
+    return serializedTail;
+  }
+
+  public long getFileModificationTime() {
+    return fileModificationTime;
+  }
+
+  public OrcProto.Footer getFooter() {
+    return fileTail.getFooter();
+  }
+
+  public OrcProto.PostScript getPostScript() {
+    return fileTail.getPostscript();
+  }
+
+  public OrcFile.WriterVersion getWriterVersion() {
+    OrcProto.PostScript ps = fileTail.getPostscript();
+    return (ps.hasWriterVersion()
+        ? OrcFile.WriterVersion.from(ps.getWriterVersion()) : OrcFile.WriterVersion.ORIGINAL);
+  }
+
+  public List<StripeInformation> getStripes() {
+    List<StripeInformation> result = new ArrayList<>(fileTail.getFooter().getStripesCount());
+    for (OrcProto.StripeInformation stripeProto : fileTail.getFooter().getStripesList()) {
+      result.add(new ReaderImpl.StripeInformationImpl(stripeProto));
+    }
+    return result;
+  }
+
+  public CompressionKind getCompressionKind() {
+    return CompressionKind.valueOf(fileTail.getPostscript().getCompression().name());
+  }
+
+  public CompressionCodec getCompressionCodec() {
+    return WriterImpl.createCodec(getCompressionKind());
+  }
+
+  public int getCompressionBufferSize() {
+    return (int) fileTail.getPostscript().getCompressionBlockSize();
+  }
+
+  public List<StripeStatistics> getStripeStatistics() throws IOException {
+    List<StripeStatistics> result = new ArrayList<>();
+    List<OrcProto.StripeStatistics> ssProto = getStripeStatisticsProto();
+    if (ssProto != null) {
+      for (OrcProto.StripeStatistics ss : ssProto) {
+        result.add(new StripeStatistics(ss.getColStatsList()));
+      }
+    }
+    return result;
+  }
+
+  public List<OrcProto.StripeStatistics> getStripeStatisticsProto() throws IOException {
+    if (serializedTail == null) return null;
+    if (metadata == null) {
+      metadata = extractMetadata(serializedTail, 0,
+          (int) fileTail.getPostscript().getMetadataLength(),
+          getCompressionCodec(), getCompressionBufferSize());
+      // clear does not clear the contents but sets position to 0 and limit = capacity
+      serializedTail.clear();
+    }
+    return metadata.getStripeStatsList();
+  }
+
+  public int getMetadataSize() {
+    return (int) getPostScript().getMetadataLength();
+  }
+
+  public List<OrcProto.Type> getTypes() {
+    return getFooter().getTypesList();
+  }
+
+  public OrcProto.FileTail getFileTail() {
+    return fileTail;
+  }
+
+  public OrcProto.FileTail getMinimalFileTail() {
+    OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder(fileTail);
+    OrcProto.Footer.Builder footerBuilder = OrcProto.Footer.newBuilder(fileTail.getFooter());
+    footerBuilder.clearStatistics();
+    fileTailBuilder.setFooter(footerBuilder.build());
+    OrcProto.FileTail result = fileTailBuilder.build();
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/13ee0b3c/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
index 7625d4a..a18f922 100644
--- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
@@ -27,6 +27,9 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.FileMetadata;
 import org.apache.orc.OrcFile;
 import org.apache.orc.OrcUtils;
 import org.apache.orc.Reader;
@@ -35,8 +38,6 @@ import org.apache.orc.TypeDescription;
 import org.apache.orc.ColumnStatistics;
 import org.apache.orc.CompressionCodec;
 import org.apache.orc.FileFormatException;
-import org.apache.orc.FileMetaInfo;
-import org.apache.orc.FileMetadata;
 import org.apache.orc.StripeInformation;
 import org.apache.orc.StripeStatistics;
 import org.slf4j.Logger;
@@ -62,27 +63,25 @@ public class ReaderImpl implements Reader {
   private final long maxLength;
   protected final Path path;
   protected final org.apache.orc.CompressionKind compressionKind;
-  protected final CompressionCodec codec;
-  protected final int bufferSize;
-  private final List<OrcProto.StripeStatistics> stripeStats;
+  protected CompressionCodec codec;
+  protected int bufferSize;
+  protected OrcProto.Metadata metadata;
+  private List<OrcProto.StripeStatistics> stripeStats;
   private final int metadataSize;
   protected final List<OrcProto.Type> types;
-  private final TypeDescription schema;
+  private TypeDescription schema;
   private final List<OrcProto.UserMetadataItem> userMetadata;
   private final List<OrcProto.ColumnStatistics> fileStats;
   private final List<StripeInformation> stripes;
   protected final int rowIndexStride;
   private final long contentLength, numberOfRows;
 
-
   private long deserializedSize = -1;
   protected final Configuration conf;
   private final List<Integer> versionList;
   private final OrcFile.WriterVersion writerVersion;
 
-  // Same for metastore cache - maintains the same background buffer, but includes postscript.
-  // This will only be set if the file footer/metadata was read from disk.
-  private final ByteBuffer footerMetaAndPsBuffer;
+  protected OrcTail tail;
 
   public static class StripeInformationImpl
       implements StripeInformation {
@@ -206,6 +205,11 @@ public class ReaderImpl implements Reader {
   }
 
   @Override
+  public OrcProto.FileTail getFileTail() {
+    return tail.getFileTail();
+  }
+
+  @Override
   public int getRowIndexStride() {
     return rowIndexStride;
   }
@@ -260,6 +264,32 @@ public class ReaderImpl implements Reader {
   }
 
   /**
+   * Ensure this is an ORC file to prevent users from trying to read text
+   * files or RC files as ORC files.
+   * @param psLen the postscript length
+   * @param buffer the tail of the file
+   * @throws IOException
+   */
+  protected static void ensureOrcFooter(ByteBuffer buffer, int psLen) throws IOException {
+    int magicLength = OrcFile.MAGIC.length();
+    int fullLength = magicLength + 1;
+    if (psLen < fullLength || buffer.remaining() < fullLength) {
+      throw new FileFormatException("Malformed ORC file. Invalid postscript length " + psLen);
+    }
+
+    int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;
+    byte[] array = buffer.array();
+    // now look for the magic string at the end of the postscript.
+    if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {
+      // if it isn't there, this may be 0.11.0 version of the ORC file.
+      // Read the first 3 bytes from the buffer to check for the header
+      if (!Text.decode(buffer.array(), 0, magicLength).equals(OrcFile.MAGIC)) {
+        throw new FileFormatException("Malformed ORC file. Invalid postscript length " + psLen);
+      }
+    }
+  }
+
+  /**
    * Build a version string out of an array.
    * @param version the version number as a list
    * @return the human readable form of the version string
@@ -315,7 +345,6 @@ public class ReaderImpl implements Reader {
     this.path = path;
     this.conf = options.getConfiguration();
     this.maxLength = options.getMaxLength();
-
     FileMetadata fileMetadata = options.getFileMetadata();
     if (fileMetadata != null) {
       this.compressionKind = fileMetadata.getCompressionKind();
@@ -333,38 +362,28 @@ public class ReaderImpl implements Reader {
       this.fileStats = fileMetadata.getFileStats();
       this.stripes = fileMetadata.getStripes();
       this.userMetadata = null; // not cached and not needed here
-      this.footerMetaAndPsBuffer = null;
     } else {
-      FileMetaInfo footerMetaData;
-      if (options.getFileMetaInfo() != null) {
-        footerMetaData = options.getFileMetaInfo();
-        this.footerMetaAndPsBuffer = null;
+      OrcTail orcTail = options.getOrcTail();
+      if (orcTail == null) {
+        tail = extractFileTail(fs, path, options.getMaxLength());
+        options.orcTail(tail);
       } else {
-        footerMetaData = extractMetaInfoFromFooter(fs, path,
-            options.getMaxLength());
-        this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer;
+        tail = orcTail;
       }
-      options.fileMetaInfo(footerMetaData);
-      MetaInfoObjExtractor rInfo =
-          new MetaInfoObjExtractor(footerMetaData.compressionType,
-                                   footerMetaData.bufferSize,
-                                   footerMetaData.metadataSize,
-                                   footerMetaData.footerBuffer
-                                   );
-      this.compressionKind = rInfo.compressionKind;
-      this.codec = rInfo.codec;
-      this.bufferSize = rInfo.bufferSize;
-      this.metadataSize = rInfo.metadataSize;
-      this.stripeStats = rInfo.metadata.getStripeStatsList();
-      this.types = rInfo.footer.getTypesList();
-      this.rowIndexStride = rInfo.footer.getRowIndexStride();
-      this.contentLength = rInfo.footer.getContentLength();
-      this.numberOfRows = rInfo.footer.getNumberOfRows();
-      this.userMetadata = rInfo.footer.getMetadataList();
-      this.fileStats = rInfo.footer.getStatisticsList();
-      this.versionList = footerMetaData.versionList;
-      this.writerVersion = footerMetaData.writerVersion;
-      this.stripes = convertProtoStripesToStripes(rInfo.footer.getStripesList());
+      this.compressionKind = tail.getCompressionKind();
+      this.codec = tail.getCompressionCodec();
+      this.bufferSize = tail.getCompressionBufferSize();
+      this.metadataSize = tail.getMetadataSize();
+      this.versionList = tail.getPostScript().getVersionList();
+      this.types = tail.getFooter().getTypesList();
+      this.rowIndexStride = tail.getFooter().getRowIndexStride();
+      this.contentLength = tail.getFooter().getContentLength();
+      this.numberOfRows = tail.getFooter().getNumberOfRows();
+      this.userMetadata = tail.getFooter().getMetadataList();
+      this.fileStats = tail.getFooter().getStatisticsList();
+      this.writerVersion = tail.getWriterVersion();
+      this.stripes = tail.getStripes();
+      this.stripeStats = tail.getStripeStatisticsProto();
     }
     this.schema = OrcUtils.convertTypeFromProtobuf(this.types, 0);
   }
@@ -397,7 +416,7 @@ public class ReaderImpl implements Reader {
         singleton(new BufferChunk(bb, 0)), footerSize, codec, bufferSize));
   }
 
-  private static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos,
+  public static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos,
       int metadataSize, CompressionCodec codec, int bufferSize) throws IOException {
     bb.position(metadataAbsPos);
     bb.limit(metadataAbsPos + metadataSize);
@@ -430,22 +449,55 @@ public class ReaderImpl implements Reader {
     return ps;
   }
 
-  private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs,
-                                                        Path path,
-                                                        long maxFileLength
-                                                        ) throws IOException {
+  public static OrcTail extractFileTail(ByteBuffer buffer)
+      throws IOException {
+    return extractFileTail(buffer, -1, -1);
+  }
+
+  public static OrcTail extractFileTail(ByteBuffer buffer, long fileLength, long modificationTime)
+      throws IOException {
+    int readSize = buffer.limit();
+    int psLen = buffer.get(readSize - 1) & 0xff;
+    int psOffset = readSize - 1 - psLen;
+    ensureOrcFooter(buffer, psLen);
+    byte[] psBuffer = new byte[psLen];
+    System.arraycopy(buffer.array(), psOffset, psBuffer, 0, psLen);
+    OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(psBuffer);
+    int footerSize = (int) ps.getFooterLength();
+    CompressionCodec codec = WriterImpl
+        .createCodec(CompressionKind.valueOf(ps.getCompression().name()));
+    OrcProto.Footer footer = extractFooter(buffer,
+        (int) (buffer.position() + ps.getMetadataLength()),
+        footerSize, codec, (int) ps.getCompressionBlockSize());
+    OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder()
+        .setPostscriptLength(psLen)
+        .setPostscript(ps)
+        .setFooter(footer)
+        .setFileLength(fileLength);
+    // clear does not clear the contents but sets position to 0 and limit = capacity
+    buffer.clear();
+    return new OrcTail(fileTailBuilder.build(), buffer.slice(), modificationTime);
+  }
+
+  protected OrcTail extractFileTail(FileSystem fs, Path path,
+      long maxFileLength) throws IOException {
     FSDataInputStream file = fs.open(path);
-    ByteBuffer buffer = null, fullFooterBuffer = null;
-    OrcProto.PostScript ps = null;
-    OrcFile.WriterVersion writerVersion = null;
+    ByteBuffer buffer;
+    OrcProto.PostScript ps;
+    OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder();
+    long modificationTime;
     try {
       // figure out the size of the file using the option or filesystem
       long size;
       if (maxFileLength == Long.MAX_VALUE) {
-        size = fs.getFileStatus(path).getLen();
+        FileStatus fileStatus = fs.getFileStatus(path);
+        size = fileStatus.getLen();
+        modificationTime = fileStatus.getModificationTime();
       } else {
         size = maxFileLength;
+        modificationTime = -1;
       }
+      fileTailBuilder.setFileLength(size);
 
       //read last bytes into buffer to get PostScript
       int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
@@ -461,13 +513,16 @@ public class ReaderImpl implements Reader {
       ensureOrcFooter(file, path, psLen, buffer);
       int psOffset = readSize - 1 - psLen;
       ps = extractPostScript(buffer, path, psLen, psOffset);
+      bufferSize = (int) ps.getCompressionBlockSize();
+      codec = WriterImpl.createCodec(CompressionKind.valueOf(ps.getCompression().name()));
+      fileTailBuilder.setPostscriptLength(psLen).setPostscript(ps);
 
       int footerSize = (int) ps.getFooterLength();
       int metadataSize = (int) ps.getMetadataLength();
-      writerVersion = extractWriterVersion(ps);
 
       //check if extra bytes need to be read
       int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize);
+      int tailSize = 1 + psLen + footerSize + metadataSize;
       if (extra > 0) {
         //more bytes need to be read, seek back to the right place and read extra bytes
         ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize);
@@ -478,17 +533,23 @@ public class ReaderImpl implements Reader {
         extraBuf.put(buffer);
         buffer = extraBuf;
         buffer.position(0);
-        fullFooterBuffer = buffer.slice();
-        buffer.limit(footerSize + metadataSize);
+        buffer.limit(tailSize);
+        readSize += extra;
+        psOffset = readSize - 1 - psLen;
       } else {
         //footer is already in the bytes in buffer, just adjust position, length
         buffer.position(psOffset - footerSize - metadataSize);
-        fullFooterBuffer = buffer.slice();
-        buffer.limit(psOffset);
+        buffer.limit(buffer.position() + tailSize);
       }
 
-      // remember position for later TODO: what later? this comment is useless
       buffer.mark();
+      int footerOffset = psOffset - footerSize;
+      buffer.position(footerOffset);
+      ByteBuffer footerBuffer = buffer.slice();
+      buffer.reset();
+      OrcProto.Footer footer = extractFooter(footerBuffer, 0, footerSize,
+          codec, bufferSize);
+      fileTailBuilder.setFooter(footer);
     } finally {
       try {
         file.close();
@@ -497,68 +558,15 @@ public class ReaderImpl implements Reader {
       }
     }
 
-    return new FileMetaInfo(
-        ps.getCompression().toString(),
-        (int) ps.getCompressionBlockSize(),
-        (int) ps.getMetadataLength(),
-        buffer,
-        ps.getVersionList(),
-        writerVersion,
-        fullFooterBuffer
-        );
-  }
-
-  protected static OrcFile.WriterVersion extractWriterVersion(OrcProto.PostScript ps) {
-    return (ps.hasWriterVersion()
-        ? getWriterVersion(ps.getWriterVersion()) : OrcFile.WriterVersion.ORIGINAL);
-  }
-
-  protected static List<StripeInformation> convertProtoStripesToStripes(
-      List<OrcProto.StripeInformation> stripes) {
-    List<StripeInformation> result = new ArrayList<StripeInformation>(stripes.size());
-    for (OrcProto.StripeInformation info : stripes) {
-      result.add(new StripeInformationImpl(info));
-    }
-    return result;
-  }
-
-  /**
-   * MetaInfoObjExtractor - has logic to create the values for the fields in ReaderImpl
-   *  from serialized fields.
-   * As the fields are final, the fields need to be initialized in the constructor and
-   *  can't be done in some helper function. So this helper class is used instead.
-   *
-   */
-  private static class MetaInfoObjExtractor{
-    final org.apache.orc.CompressionKind compressionKind;
-    final CompressionCodec codec;
-    final int bufferSize;
-    final int metadataSize;
-    final OrcProto.Metadata metadata;
-    final OrcProto.Footer footer;
-
-    MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize, 
-        ByteBuffer footerBuffer) throws IOException {
-
-      this.compressionKind = org.apache.orc.CompressionKind.valueOf(codecStr.toUpperCase());
-      this.bufferSize = bufferSize;
-      this.codec = WriterImpl.createCodec(compressionKind);
-      this.metadataSize = metadataSize;
-
-      int position = footerBuffer.position();
-      int footerBufferSize = footerBuffer.limit() - footerBuffer.position() - metadataSize;
-
-      this.metadata = extractMetadata(footerBuffer, position, metadataSize, codec, bufferSize);
-      this.footer = extractFooter(
-          footerBuffer, position + metadataSize, footerBufferSize, codec, bufferSize);
-
-      footerBuffer.position(position);
-    }
+    ByteBuffer serializedTail = ByteBuffer.allocate(buffer.remaining());
+    serializedTail.put(buffer.slice());
+    serializedTail.rewind();
+    return new OrcTail(fileTailBuilder.build(), serializedTail, modificationTime);
   }
 
   @Override
   public ByteBuffer getSerializedFileFooter() {
-    return footerMetaAndPsBuffer;
+    return tail.getSerializedTail();
   }
 
   @Override
@@ -727,7 +735,11 @@ public class ReaderImpl implements Reader {
   }
 
   @Override
-  public List<StripeStatistics> getStripeStatistics() {
+  public List<StripeStatistics> getStripeStatistics() throws IOException {
+    if (stripeStats == null && metadata == null) {
+      metadata = extractMetadata(tail.getSerializedTail(), 0, metadataSize, codec, bufferSize);
+      stripeStats = metadata.getStripeStatsList();
+    }
     List<StripeStatistics> result = new ArrayList<>();
     for (OrcProto.StripeStatistics ss : stripeStats) {
       result.add(new StripeStatistics(ss.getColStatsList()));

http://git-wip-us.apache.org/repos/asf/orc/blob/13ee0b3c/proto/orc_proto.proto
----------------------------------------------------------------------
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index 6b7e597..dbc34ab 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -224,6 +224,7 @@ message PostScript {
 }
 
 // The contents of the file tail that must be serialized.
+// This gets serialized as part of OrcSplit and is also used by the footer cache.
 message FileTail {
   optional PostScript postscript = 1;
   optional Footer footer = 2;