You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2016/07/01 17:56:22 UTC
[7/8] orc git commit: HIVE-13985. ORC improvements for reducing the
file system calls in the task side.
HIVE-13985. ORC improvements for reducing the file system calls in the task side.
Signed-off-by: Owen O'Malley <om...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/41208a78
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/41208a78
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/41208a78
Branch: refs/heads/branch-1.1
Commit: 41208a7835449741808ba8b01285db9859065f9c
Parents: 775bc2a
Author: Owen O'Malley <om...@apache.org>
Authored: Thu Jun 30 10:38:32 2016 -0700
Committer: Owen O'Malley <om...@apache.org>
Committed: Fri Jul 1 09:33:11 2016 -0700
----------------------------------------------------------------------
.../src/java/org/apache/orc/FileMetaInfo.java | 64 -----
java/core/src/java/org/apache/orc/OrcFile.java | 32 +--
java/core/src/java/org/apache/orc/OrcUtils.java | 11 +
java/core/src/java/org/apache/orc/Reader.java | 9 +-
.../src/java/org/apache/orc/impl/OrcTail.java | 140 +++++++++++
.../java/org/apache/orc/impl/ReaderImpl.java | 240 ++++++++++---------
proto/orc_proto.proto | 1 +
7 files changed, 304 insertions(+), 193 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/41208a78/java/core/src/java/org/apache/orc/FileMetaInfo.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/FileMetaInfo.java b/java/core/src/java/org/apache/orc/FileMetaInfo.java
deleted file mode 100644
index d3cac3b..0000000
--- a/java/core/src/java/org/apache/orc/FileMetaInfo.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import java.nio.ByteBuffer;
-import java.util.List;
-
-/**
- * FileMetaInfo - represents file metadata stored in footer and postscript sections of the file
- * that is useful for Reader implementation
- *
- */
-public class FileMetaInfo {
- public ByteBuffer footerMetaAndPsBuffer;
- public final String compressionType;
- public final int bufferSize;
- public final int metadataSize;
- public final ByteBuffer footerBuffer;
- public final List<Integer> versionList;
- public final OrcFile.WriterVersion writerVersion;
-
-
- /** Ctor used when reading splits - no version list or full footer buffer. */
- public FileMetaInfo(String compressionType, int bufferSize, int metadataSize,
- ByteBuffer footerBuffer, OrcFile.WriterVersion writerVersion) {
- this(compressionType, bufferSize, metadataSize, footerBuffer, null,
- writerVersion, null);
- }
-
- /** Ctor used when creating file info during init and when getting a new one. */
- public FileMetaInfo(String compressionType, int bufferSize, int metadataSize,
- ByteBuffer footerBuffer, List<Integer> versionList,
- OrcFile.WriterVersion writerVersion,
- ByteBuffer fullFooterBuffer) {
- this.compressionType = compressionType;
- this.bufferSize = bufferSize;
- this.metadataSize = metadataSize;
- this.footerBuffer = footerBuffer;
- this.versionList = versionList;
- this.writerVersion = writerVersion;
- this.footerMetaAndPsBuffer = fullFooterBuffer;
- }
-
- public OrcFile.WriterVersion getWriterVersion() {
- return writerVersion;
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/orc/blob/41208a78/java/core/src/java/org/apache/orc/OrcFile.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java
index 7dd7333..ddfa9f7 100644
--- a/java/core/src/java/org/apache/orc/OrcFile.java
+++ b/java/core/src/java/org/apache/orc/OrcFile.java
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.orc.impl.MemoryManager;
+import org.apache.orc.impl.OrcTail;
import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.impl.WriterImpl;
@@ -160,19 +161,17 @@ public class OrcFile {
public static class ReaderOptions {
private final Configuration conf;
private FileSystem filesystem;
- private FileMetaInfo fileMetaInfo; // TODO: this comes from some place.
private long maxLength = Long.MAX_VALUE;
- private FileMetadata fullFileMetadata; // Propagate from LLAP cache.
+ private OrcTail orcTail;
+ // TODO: We can generalize FileMetadata interface. Make OrcTail implement FileMetadata interface
+ // and remove this class altogether. Both footer caching and llap caching just need OrcTail.
+ // For now keeping this around to avoid complex surgery.
+ private FileMetadata fileMetadata;
public ReaderOptions(Configuration conf) {
this.conf = conf;
}
- public ReaderOptions fileMetaInfo(FileMetaInfo info) {
- fileMetaInfo = info;
- return this;
- }
-
public ReaderOptions filesystem(FileSystem fs) {
this.filesystem = fs;
return this;
@@ -183,8 +182,8 @@ public class OrcFile {
return this;
}
- public ReaderOptions fileMetadata(FileMetadata metadata) {
- this.fullFileMetadata = metadata;
+ public ReaderOptions orcTail(OrcTail tail) {
+ this.orcTail = tail;
return this;
}
@@ -196,16 +195,21 @@ public class OrcFile {
return filesystem;
}
- public FileMetaInfo getFileMetaInfo() {
- return fileMetaInfo;
- }
-
public long getMaxLength() {
return maxLength;
}
+ public OrcTail getOrcTail() {
+ return orcTail;
+ }
+
+ public ReaderOptions fileMetadata(final FileMetadata metadata) {
+ fileMetadata = metadata;
+ return this;
+ }
+
public FileMetadata getFileMetadata() {
- return fullFileMetadata;
+ return fileMetadata;
}
}
http://git-wip-us.apache.org/repos/asf/orc/blob/41208a78/java/core/src/java/org/apache/orc/OrcUtils.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java
index 9dd7504..94493b3 100644
--- a/java/core/src/java/org/apache/orc/OrcUtils.java
+++ b/java/core/src/java/org/apache/orc/OrcUtils.java
@@ -17,6 +17,8 @@
*/
package org.apache.orc;
+import org.apache.orc.impl.ReaderImpl;
+
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -525,4 +527,13 @@ public class OrcUtils {
}
throw new IllegalArgumentException("Unknown ORC type " + type.getKind());
}
+
+ public static List<StripeInformation> convertProtoStripesToStripes(
+ List<OrcProto.StripeInformation> stripes) {
+ List<StripeInformation> result = new ArrayList<StripeInformation>(stripes.size());
+ for (OrcProto.StripeInformation info : stripes) {
+ result.add(new ReaderImpl.StripeInformationImpl(info));
+ }
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/orc/blob/41208a78/java/core/src/java/org/apache/orc/Reader.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/Reader.java b/java/core/src/java/org/apache/orc/Reader.java
index 87f3293..c2d5235 100644
--- a/java/core/src/java/org/apache/orc/Reader.java
+++ b/java/core/src/java/org/apache/orc/Reader.java
@@ -138,6 +138,13 @@ public interface Reader {
OrcFile.WriterVersion getWriterVersion();
/**
+ * Get the file tail (footer + postscript).
+ *
+ * @return the file tail
+ */
+ OrcProto.FileTail getFileTail();
+
+ /**
* Options for creating a RecordReader.
*/
public static class Options {
@@ -354,7 +361,7 @@ public interface Reader {
/**
* @return Stripe statistics.
*/
- List<StripeStatistics> getStripeStatistics();
+ List<StripeStatistics> getStripeStatistics() throws IOException;
/**
* @return File statistics, in original protobuf form.
http://git-wip-us.apache.org/repos/asf/orc/blob/41208a78/java/core/src/java/org/apache/orc/impl/OrcTail.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/OrcTail.java b/java/core/src/java/org/apache/orc/impl/OrcTail.java
new file mode 100644
index 0000000..b5f85fb
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/impl/OrcTail.java
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import static org.apache.orc.impl.ReaderImpl.extractMetadata;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.orc.CompressionCodec;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcFile;
+import org.apache.orc.OrcProto;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.StripeStatistics;
+
+// TODO: Make OrcTail implement FileMetadata or Reader interface
+public final class OrcTail {
+ // postscript + footer - Serialized in OrcSplit
+ private final OrcProto.FileTail fileTail;
+ // serialized representation of metadata, footer and postscript
+ private final ByteBuffer serializedTail;
+ // used to invalidate cache entries
+ private final long fileModificationTime;
+ // lazily deserialized
+ private OrcProto.Metadata metadata;
+
+ public OrcTail(OrcProto.FileTail fileTail, ByteBuffer serializedTail) {
+ this(fileTail, serializedTail, -1);
+ }
+
+ public OrcTail(OrcProto.FileTail fileTail, ByteBuffer serializedTail, long fileModificationTime) {
+ this.fileTail = fileTail;
+ this.serializedTail = serializedTail;
+ this.fileModificationTime = fileModificationTime;
+ this.metadata = null;
+ }
+
+ public ByteBuffer getSerializedTail() {
+ return serializedTail;
+ }
+
+ public long getFileModificationTime() {
+ return fileModificationTime;
+ }
+
+ public OrcProto.Footer getFooter() {
+ return fileTail.getFooter();
+ }
+
+ public OrcProto.PostScript getPostScript() {
+ return fileTail.getPostscript();
+ }
+
+ public OrcFile.WriterVersion getWriterVersion() {
+ OrcProto.PostScript ps = fileTail.getPostscript();
+ return (ps.hasWriterVersion()
+ ? OrcFile.WriterVersion.from(ps.getWriterVersion()) : OrcFile.WriterVersion.ORIGINAL);
+ }
+
+ public List<StripeInformation> getStripes() {
+ List<StripeInformation> result = new ArrayList<>(fileTail.getFooter().getStripesCount());
+ for (OrcProto.StripeInformation stripeProto : fileTail.getFooter().getStripesList()) {
+ result.add(new ReaderImpl.StripeInformationImpl(stripeProto));
+ }
+ return result;
+ }
+
+ public CompressionKind getCompressionKind() {
+ return CompressionKind.valueOf(fileTail.getPostscript().getCompression().name());
+ }
+
+ public CompressionCodec getCompressionCodec() {
+ return WriterImpl.createCodec(getCompressionKind());
+ }
+
+ public int getCompressionBufferSize() {
+ return (int) fileTail.getPostscript().getCompressionBlockSize();
+ }
+
+ public List<StripeStatistics> getStripeStatistics() throws IOException {
+ List<StripeStatistics> result = new ArrayList<>();
+ List<OrcProto.StripeStatistics> ssProto = getStripeStatisticsProto();
+ if (ssProto != null) {
+ for (OrcProto.StripeStatistics ss : ssProto) {
+ result.add(new StripeStatistics(ss.getColStatsList()));
+ }
+ }
+ return result;
+ }
+
+ public List<OrcProto.StripeStatistics> getStripeStatisticsProto() throws IOException {
+ if (serializedTail == null) return null;
+ if (metadata == null) {
+ metadata = extractMetadata(serializedTail, 0,
+ (int) fileTail.getPostscript().getMetadataLength(),
+ getCompressionCodec(), getCompressionBufferSize());
+ // clear does not clear the contents but sets position to 0 and limit = capacity
+ serializedTail.clear();
+ }
+ return metadata.getStripeStatsList();
+ }
+
+ public int getMetadataSize() {
+ return (int) getPostScript().getMetadataLength();
+ }
+
+ public List<OrcProto.Type> getTypes() {
+ return getFooter().getTypesList();
+ }
+
+ public OrcProto.FileTail getFileTail() {
+ return fileTail;
+ }
+
+ public OrcProto.FileTail getMinimalFileTail() {
+ OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder(fileTail);
+ OrcProto.Footer.Builder footerBuilder = OrcProto.Footer.newBuilder(fileTail.getFooter());
+ footerBuilder.clearStatistics();
+ fileTailBuilder.setFooter(footerBuilder.build());
+ OrcProto.FileTail result = fileTailBuilder.build();
+ return result;
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/41208a78/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
index 7625d4a..a18f922 100644
--- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
@@ -27,6 +27,9 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.FileMetadata;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcUtils;
import org.apache.orc.Reader;
@@ -35,8 +38,6 @@ import org.apache.orc.TypeDescription;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.CompressionCodec;
import org.apache.orc.FileFormatException;
-import org.apache.orc.FileMetaInfo;
-import org.apache.orc.FileMetadata;
import org.apache.orc.StripeInformation;
import org.apache.orc.StripeStatistics;
import org.slf4j.Logger;
@@ -62,27 +63,25 @@ public class ReaderImpl implements Reader {
private final long maxLength;
protected final Path path;
protected final org.apache.orc.CompressionKind compressionKind;
- protected final CompressionCodec codec;
- protected final int bufferSize;
- private final List<OrcProto.StripeStatistics> stripeStats;
+ protected CompressionCodec codec;
+ protected int bufferSize;
+ protected OrcProto.Metadata metadata;
+ private List<OrcProto.StripeStatistics> stripeStats;
private final int metadataSize;
protected final List<OrcProto.Type> types;
- private final TypeDescription schema;
+ private TypeDescription schema;
private final List<OrcProto.UserMetadataItem> userMetadata;
private final List<OrcProto.ColumnStatistics> fileStats;
private final List<StripeInformation> stripes;
protected final int rowIndexStride;
private final long contentLength, numberOfRows;
-
private long deserializedSize = -1;
protected final Configuration conf;
private final List<Integer> versionList;
private final OrcFile.WriterVersion writerVersion;
- // Same for metastore cache - maintains the same background buffer, but includes postscript.
- // This will only be set if the file footer/metadata was read from disk.
- private final ByteBuffer footerMetaAndPsBuffer;
+ protected OrcTail tail;
public static class StripeInformationImpl
implements StripeInformation {
@@ -206,6 +205,11 @@ public class ReaderImpl implements Reader {
}
@Override
+ public OrcProto.FileTail getFileTail() {
+ return tail.getFileTail();
+ }
+
+ @Override
public int getRowIndexStride() {
return rowIndexStride;
}
@@ -260,6 +264,32 @@ public class ReaderImpl implements Reader {
}
/**
+ * Ensure this is an ORC file to prevent users from trying to read text
+ * files or RC files as ORC files.
+ * @param buffer the tail of the file
+ * @param psLen the postscript length
+ * @throws IOException if the postscript length is invalid or the ORC magic string is not found
+ */
+ protected static void ensureOrcFooter(ByteBuffer buffer, int psLen) throws IOException {
+ int magicLength = OrcFile.MAGIC.length();
+ int fullLength = magicLength + 1;
+ if (psLen < fullLength || buffer.remaining() < fullLength) {
+ throw new FileFormatException("Malformed ORC file. Invalid postscript length " + psLen);
+ }
+
+ int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;
+ byte[] array = buffer.array();
+ // now look for the magic string at the end of the postscript.
+ if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {
+ // if it isn't there, this may be 0.11.0 version of the ORC file.
+ // Read the first 3 bytes from the buffer to check for the header
+ if (!Text.decode(buffer.array(), 0, magicLength).equals(OrcFile.MAGIC)) {
+ throw new FileFormatException("Malformed ORC file. Invalid postscript length " + psLen);
+ }
+ }
+ }
+
+ /**
* Build a version string out of an array.
* @param version the version number as a list
* @return the human readable form of the version string
@@ -315,7 +345,6 @@ public class ReaderImpl implements Reader {
this.path = path;
this.conf = options.getConfiguration();
this.maxLength = options.getMaxLength();
-
FileMetadata fileMetadata = options.getFileMetadata();
if (fileMetadata != null) {
this.compressionKind = fileMetadata.getCompressionKind();
@@ -333,38 +362,28 @@ public class ReaderImpl implements Reader {
this.fileStats = fileMetadata.getFileStats();
this.stripes = fileMetadata.getStripes();
this.userMetadata = null; // not cached and not needed here
- this.footerMetaAndPsBuffer = null;
} else {
- FileMetaInfo footerMetaData;
- if (options.getFileMetaInfo() != null) {
- footerMetaData = options.getFileMetaInfo();
- this.footerMetaAndPsBuffer = null;
+ OrcTail orcTail = options.getOrcTail();
+ if (orcTail == null) {
+ tail = extractFileTail(fs, path, options.getMaxLength());
+ options.orcTail(tail);
} else {
- footerMetaData = extractMetaInfoFromFooter(fs, path,
- options.getMaxLength());
- this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer;
+ tail = orcTail;
}
- options.fileMetaInfo(footerMetaData);
- MetaInfoObjExtractor rInfo =
- new MetaInfoObjExtractor(footerMetaData.compressionType,
- footerMetaData.bufferSize,
- footerMetaData.metadataSize,
- footerMetaData.footerBuffer
- );
- this.compressionKind = rInfo.compressionKind;
- this.codec = rInfo.codec;
- this.bufferSize = rInfo.bufferSize;
- this.metadataSize = rInfo.metadataSize;
- this.stripeStats = rInfo.metadata.getStripeStatsList();
- this.types = rInfo.footer.getTypesList();
- this.rowIndexStride = rInfo.footer.getRowIndexStride();
- this.contentLength = rInfo.footer.getContentLength();
- this.numberOfRows = rInfo.footer.getNumberOfRows();
- this.userMetadata = rInfo.footer.getMetadataList();
- this.fileStats = rInfo.footer.getStatisticsList();
- this.versionList = footerMetaData.versionList;
- this.writerVersion = footerMetaData.writerVersion;
- this.stripes = convertProtoStripesToStripes(rInfo.footer.getStripesList());
+ this.compressionKind = tail.getCompressionKind();
+ this.codec = tail.getCompressionCodec();
+ this.bufferSize = tail.getCompressionBufferSize();
+ this.metadataSize = tail.getMetadataSize();
+ this.versionList = tail.getPostScript().getVersionList();
+ this.types = tail.getFooter().getTypesList();
+ this.rowIndexStride = tail.getFooter().getRowIndexStride();
+ this.contentLength = tail.getFooter().getContentLength();
+ this.numberOfRows = tail.getFooter().getNumberOfRows();
+ this.userMetadata = tail.getFooter().getMetadataList();
+ this.fileStats = tail.getFooter().getStatisticsList();
+ this.writerVersion = tail.getWriterVersion();
+ this.stripes = tail.getStripes();
+ this.stripeStats = tail.getStripeStatisticsProto();
}
this.schema = OrcUtils.convertTypeFromProtobuf(this.types, 0);
}
@@ -397,7 +416,7 @@ public class ReaderImpl implements Reader {
singleton(new BufferChunk(bb, 0)), footerSize, codec, bufferSize));
}
- private static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos,
+ public static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos,
int metadataSize, CompressionCodec codec, int bufferSize) throws IOException {
bb.position(metadataAbsPos);
bb.limit(metadataAbsPos + metadataSize);
@@ -430,22 +449,55 @@ public class ReaderImpl implements Reader {
return ps;
}
- private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs,
- Path path,
- long maxFileLength
- ) throws IOException {
+ public static OrcTail extractFileTail(ByteBuffer buffer)
+ throws IOException {
+ return extractFileTail(buffer, -1, -1);
+ }
+
+ public static OrcTail extractFileTail(ByteBuffer buffer, long fileLength, long modificationTime)
+ throws IOException {
+ int readSize = buffer.limit();
+ int psLen = buffer.get(readSize - 1) & 0xff;
+ int psOffset = readSize - 1 - psLen;
+ ensureOrcFooter(buffer, psLen);
+ byte[] psBuffer = new byte[psLen];
+ System.arraycopy(buffer.array(), psOffset, psBuffer, 0, psLen);
+ OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(psBuffer);
+ int footerSize = (int) ps.getFooterLength();
+ CompressionCodec codec = WriterImpl
+ .createCodec(CompressionKind.valueOf(ps.getCompression().name()));
+ OrcProto.Footer footer = extractFooter(buffer,
+ (int) (buffer.position() + ps.getMetadataLength()),
+ footerSize, codec, (int) ps.getCompressionBlockSize());
+ OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder()
+ .setPostscriptLength(psLen)
+ .setPostscript(ps)
+ .setFooter(footer)
+ .setFileLength(fileLength);
+ // clear does not clear the contents but sets position to 0 and limit = capacity
+ buffer.clear();
+ return new OrcTail(fileTailBuilder.build(), buffer.slice(), modificationTime);
+ }
+
+ protected OrcTail extractFileTail(FileSystem fs, Path path,
+ long maxFileLength) throws IOException {
FSDataInputStream file = fs.open(path);
- ByteBuffer buffer = null, fullFooterBuffer = null;
- OrcProto.PostScript ps = null;
- OrcFile.WriterVersion writerVersion = null;
+ ByteBuffer buffer;
+ OrcProto.PostScript ps;
+ OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder();
+ long modificationTime;
try {
// figure out the size of the file using the option or filesystem
long size;
if (maxFileLength == Long.MAX_VALUE) {
- size = fs.getFileStatus(path).getLen();
+ FileStatus fileStatus = fs.getFileStatus(path);
+ size = fileStatus.getLen();
+ modificationTime = fileStatus.getModificationTime();
} else {
size = maxFileLength;
+ modificationTime = -1;
}
+ fileTailBuilder.setFileLength(size);
//read last bytes into buffer to get PostScript
int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
@@ -461,13 +513,16 @@ public class ReaderImpl implements Reader {
ensureOrcFooter(file, path, psLen, buffer);
int psOffset = readSize - 1 - psLen;
ps = extractPostScript(buffer, path, psLen, psOffset);
+ bufferSize = (int) ps.getCompressionBlockSize();
+ codec = WriterImpl.createCodec(CompressionKind.valueOf(ps.getCompression().name()));
+ fileTailBuilder.setPostscriptLength(psLen).setPostscript(ps);
int footerSize = (int) ps.getFooterLength();
int metadataSize = (int) ps.getMetadataLength();
- writerVersion = extractWriterVersion(ps);
//check if extra bytes need to be read
int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize);
+ int tailSize = 1 + psLen + footerSize + metadataSize;
if (extra > 0) {
//more bytes need to be read, seek back to the right place and read extra bytes
ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize);
@@ -478,17 +533,23 @@ public class ReaderImpl implements Reader {
extraBuf.put(buffer);
buffer = extraBuf;
buffer.position(0);
- fullFooterBuffer = buffer.slice();
- buffer.limit(footerSize + metadataSize);
+ buffer.limit(tailSize);
+ readSize += extra;
+ psOffset = readSize - 1 - psLen;
} else {
//footer is already in the bytes in buffer, just adjust position, length
buffer.position(psOffset - footerSize - metadataSize);
- fullFooterBuffer = buffer.slice();
- buffer.limit(psOffset);
+ buffer.limit(buffer.position() + tailSize);
}
- // remember position for later TODO: what later? this comment is useless
buffer.mark();
+ int footerOffset = psOffset - footerSize;
+ buffer.position(footerOffset);
+ ByteBuffer footerBuffer = buffer.slice();
+ buffer.reset();
+ OrcProto.Footer footer = extractFooter(footerBuffer, 0, footerSize,
+ codec, bufferSize);
+ fileTailBuilder.setFooter(footer);
} finally {
try {
file.close();
@@ -497,68 +558,15 @@ public class ReaderImpl implements Reader {
}
}
- return new FileMetaInfo(
- ps.getCompression().toString(),
- (int) ps.getCompressionBlockSize(),
- (int) ps.getMetadataLength(),
- buffer,
- ps.getVersionList(),
- writerVersion,
- fullFooterBuffer
- );
- }
-
- protected static OrcFile.WriterVersion extractWriterVersion(OrcProto.PostScript ps) {
- return (ps.hasWriterVersion()
- ? getWriterVersion(ps.getWriterVersion()) : OrcFile.WriterVersion.ORIGINAL);
- }
-
- protected static List<StripeInformation> convertProtoStripesToStripes(
- List<OrcProto.StripeInformation> stripes) {
- List<StripeInformation> result = new ArrayList<StripeInformation>(stripes.size());
- for (OrcProto.StripeInformation info : stripes) {
- result.add(new StripeInformationImpl(info));
- }
- return result;
- }
-
- /**
- * MetaInfoObjExtractor - has logic to create the values for the fields in ReaderImpl
- * from serialized fields.
- * As the fields are final, the fields need to be initialized in the constructor and
- * can't be done in some helper function. So this helper class is used instead.
- *
- */
- private static class MetaInfoObjExtractor{
- final org.apache.orc.CompressionKind compressionKind;
- final CompressionCodec codec;
- final int bufferSize;
- final int metadataSize;
- final OrcProto.Metadata metadata;
- final OrcProto.Footer footer;
-
- MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize,
- ByteBuffer footerBuffer) throws IOException {
-
- this.compressionKind = org.apache.orc.CompressionKind.valueOf(codecStr.toUpperCase());
- this.bufferSize = bufferSize;
- this.codec = WriterImpl.createCodec(compressionKind);
- this.metadataSize = metadataSize;
-
- int position = footerBuffer.position();
- int footerBufferSize = footerBuffer.limit() - footerBuffer.position() - metadataSize;
-
- this.metadata = extractMetadata(footerBuffer, position, metadataSize, codec, bufferSize);
- this.footer = extractFooter(
- footerBuffer, position + metadataSize, footerBufferSize, codec, bufferSize);
-
- footerBuffer.position(position);
- }
+ ByteBuffer serializedTail = ByteBuffer.allocate(buffer.remaining());
+ serializedTail.put(buffer.slice());
+ serializedTail.rewind();
+ return new OrcTail(fileTailBuilder.build(), serializedTail, modificationTime);
}
@Override
public ByteBuffer getSerializedFileFooter() {
- return footerMetaAndPsBuffer;
+ return tail.getSerializedTail();
}
@Override
@@ -727,7 +735,11 @@ public class ReaderImpl implements Reader {
}
@Override
- public List<StripeStatistics> getStripeStatistics() {
+ public List<StripeStatistics> getStripeStatistics() throws IOException {
+ if (stripeStats == null && metadata == null) {
+ metadata = extractMetadata(tail.getSerializedTail(), 0, metadataSize, codec, bufferSize);
+ stripeStats = metadata.getStripeStatsList();
+ }
List<StripeStatistics> result = new ArrayList<>();
for (OrcProto.StripeStatistics ss : stripeStats) {
result.add(new StripeStatistics(ss.getColStatsList()));
http://git-wip-us.apache.org/repos/asf/orc/blob/41208a78/proto/orc_proto.proto
----------------------------------------------------------------------
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index 6b7e597..dbc34ab 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -224,6 +224,7 @@ message PostScript {
}
// The contents of the file tail that must be serialized.
+// This gets serialized as part of OrcSplit and is also used by the footer cache.
message FileTail {
optional PostScript postscript = 1;
optional Footer footer = 2;