You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2016/01/19 22:12:22 UTC
orc git commit: ORC-8. Reimplement file-metadata to use the reader
API. (omalley reviewed by asandryh)
Repository: orc
Updated Branches:
refs/heads/master 6fa860f2e -> aad3581bc
ORC-8. Reimplement file-metadata to use the reader API. (omalley reviewed by
asandryh)
Signed-off-by: Owen O'Malley <om...@apache.org>
Fixes apache/orc#15
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/aad3581b
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/aad3581b
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/aad3581b
Branch: refs/heads/master
Commit: aad3581bc9f66d8f16eacaf87949f743e548f20d
Parents: 6fa860f
Author: Owen O'Malley <om...@apache.org>
Authored: Fri Jan 15 12:48:41 2016 -0800
Committer: Owen O'Malley <om...@apache.org>
Committed: Tue Jan 19 13:11:34 2016 -0800
----------------------------------------------------------------------
c++/include/orc/Reader.hh | 114 +++++++++++++++-
c++/src/Reader.cc | 301 ++++++++++++++++++++++++++++++++++++-----
proto/orc_proto.proto | 3 +-
tools/src/FileMetadata.cc | 287 +++++++++++++++++++--------------------
4 files changed, 518 insertions(+), 187 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/aad3581b/c++/include/orc/Reader.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index d924fbf..b6c5480 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -40,6 +40,22 @@ namespace orc {
};
/**
+ * Get the name of the CompressionKind.
+ */
+ std::string compressionKindToString(CompressionKind kind);
+
+ /**
+  * Version of the writer that produced a file.
+  * ReaderImpl::getWriterVersion() casts the postscript's writerversion
+  * field directly to this enum, so the numeric values must stay in step
+  * with that field.
+  */
+ enum WriterVersion {
+ WriterVersion_ORIGINAL = 0,
+ WriterVersion_HIVE_8732 = 1,
+ WriterVersion_HIVE_4243 = 2
+ };
+
+ /**
+ * Get the name of the WriterVersion.
+ */
+ std::string writerVersionToString(WriterVersion kind);
+
+ /**
* Statistics that are available for all types of columns.
*/
class ColumnStatistics {
@@ -337,6 +353,41 @@ namespace orc {
virtual int64_t getMaximum() const = 0;
};
+ /**
+  * The kind of a stream within a stripe.
+  * StreamInformationImpl casts proto::Stream::kind() directly to this
+  * enum, so the numeric values must stay in step with the protobuf
+  * definition.
+  */
+ enum StreamKind {
+ StreamKind_PRESENT = 0,
+ StreamKind_DATA = 1,
+ StreamKind_LENGTH = 2,
+ StreamKind_DICTIONARY_DATA = 3,
+ StreamKind_DICTIONARY_COUNT = 4,
+ StreamKind_SECONDARY = 5,
+ StreamKind_ROW_INDEX = 6,
+ StreamKind_BLOOM_FILTER = 7
+ };
+
+ /**
+ * Get the string representation of the StreamKind.
+ */
+ std::string streamKindToString(StreamKind kind);
+
+ /**
+  * Read-only description of a single stream within a stripe, as returned
+  * by StripeInformation::getStreamInformation().
+  */
+ class StreamInformation {
+ public:
+ virtual ~StreamInformation();
+
+ // The kind of the stream.
+ virtual StreamKind getKind() const = 0;
+ // The id of the column the stream belongs to.
+ virtual uint64_t getColumnId() const = 0;
+ // The offset of the stream; the implementation in Reader.cc reports the
+ // stripe offset plus the lengths of all streams that precede this one.
+ virtual uint64_t getOffset() const = 0;
+ // The length recorded for the stream.
+ virtual uint64_t getLength() const = 0;
+ };
+
+ /**
+  * The encoding used for a column within a stripe.
+  * StripeInformation::getColumnEncoding() casts the kind stored in the
+  * stripe footer directly to this enum, so the numeric values must stay
+  * in step with the protobuf definition.
+  */
+ enum ColumnEncodingKind {
+ ColumnEncodingKind_DIRECT = 0,
+ ColumnEncodingKind_DICTIONARY = 1,
+ ColumnEncodingKind_DIRECT_V2 = 2,
+ ColumnEncodingKind_DICTIONARY_V2 = 3
+ };
+
+ /**
+  * Get the string representation of the ColumnEncodingKind.
+  */
+ std::string columnEncodingKindToString(ColumnEncodingKind kind);
+
class StripeInformation {
public:
virtual ~StripeInformation();
@@ -376,6 +427,35 @@ namespace orc {
* @return a count of the number of rows
*/
virtual uint64_t getNumberOfRows() const = 0;
+
+ /**
+ * Get the number of streams in the stripe.
+ */
+ virtual uint64_t getNumberOfStreams() const = 0;
+
+ /**
+ * Get the StreamInformation for the given stream.
+ */
+ virtual ORC_UNIQUE_PTR<StreamInformation>
+ getStreamInformation(uint64_t streamId) const = 0;
+
+ /**
+ * Get the column encoding for the given column.
+ * @param colId the columnId
+ */
+ virtual ColumnEncodingKind getColumnEncoding(uint64_t colId) const = 0;
+
+ /**
+ * Get the dictionary size.
+ * @param colId the columnId
+ * @return the size of the dictionary or 0 if there isn't one
+ */
+ virtual uint64_t getDictionarySize(uint64_t colId) const = 0;
+
+ /**
+ * Get the writer timezone.
+ */
+ virtual const std::string& getWriterTimezone() const = 0;
};
class Statistics {
@@ -616,6 +696,12 @@ namespace orc {
virtual uint64_t getCompressionSize() const = 0;
/**
+ * Get the version of the writer.
+ * @return the version of the writer.
+ */
+ virtual WriterVersion getWriterVersion() const = 0;
+
+ /**
 * Get the number of rows per an entry in the row index.
* @return the number of rows per an entry in the row index or 0 if there
* is no row index.
@@ -651,12 +737,36 @@ namespace orc {
getStripeStatistics(uint64_t stripeIndex) const = 0;
/**
- * Get the length of the file.
- * @return the number of bytes in the file
+ * Get the length of the data stripes in the file.
+ * @return the number of bytes in stripes
*/
virtual uint64_t getContentLength() const = 0;
/**
+ * Get the length of the file stripe statistics
+ * @return the number of compressed bytes in the file stripe statistics
+ */
+ virtual uint64_t getStripeStatisticsLength() const = 0;
+
+ /**
+ * Get the length of the file footer
+ * @return the number of compressed bytes in the file footer
+ */
+ virtual uint64_t getFileFooterLength() const = 0;
+
+ /**
+ * Get the length of the file postscript
+ * @return the number of bytes in the file postscript
+ */
+ virtual uint64_t getFilePostscriptLength() const = 0;
+
+ /**
+ * Get the total length of the file.
+ * @return the number of bytes in the file
+ */
+ virtual uint64_t getFileLength() const = 0;
+
+ /**
* Get the statistics about the columns in the file.
* @return the information about the column
*/
http://git-wip-us.apache.org/repos/asf/orc/blob/aad3581b/c++/src/Reader.cc
----------------------------------------------------------------------
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index 940ef16..29bd439 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -38,6 +38,36 @@
namespace orc {
+ /**
+  * Translate a CompressionKind into its display name.
+  * A value without a dedicated name is rendered as "unknown - " followed
+  * by the streamed value of kind.
+  */
+ std::string compressionKindToString(CompressionKind kind) {
+   const char* label = nullptr;
+   switch (kind) {
+   case CompressionKind_NONE:
+     label = "none";
+     break;
+   case CompressionKind_ZLIB:
+     label = "zlib";
+     break;
+   case CompressionKind_SNAPPY:
+     label = "snappy";
+     break;
+   case CompressionKind_LZO:
+     label = "LZO";
+     break;
+   }
+   if (label != nullptr) {
+     return label;
+   }
+   // No case matched: fall back to a generic description.
+   std::stringstream fallback;
+   fallback << "unknown - " << kind;
+   return fallback.str();
+ }
+
+ /**
+  * Translate a WriterVersion into its display name.
+  * A value without a dedicated name is rendered as "future - " followed
+  * by the streamed value of version.
+  */
+ std::string writerVersionToString(WriterVersion version) {
+   const char* label = nullptr;
+   switch (version) {
+   case WriterVersion_ORIGINAL:
+     label = "original";
+     break;
+   case WriterVersion_HIVE_8732:
+     label = "HIVE-8732";
+     break;
+   case WriterVersion_HIVE_4243:
+     label = "HIVE-4243";
+     break;
+   }
+   if (label != nullptr) {
+     return label;
+   }
+   // No case matched: fall back to a generic description.
+   std::stringstream fallback;
+   fallback << "future - " << version;
+   return fallback.str();
+ }
+
struct ReaderOptionsPrivate {
bool setIndexes;
bool setNames;
@@ -199,8 +229,12 @@ namespace orc {
return privateBits->serializedTail;
}
- StripeInformation::~StripeInformation() {
+ StreamInformation::~StreamInformation() {
+ // PASS
+ }
+ StripeInformation::~StripeInformation() {
+ // PASS
}
class ColumnStatisticsImpl: public ColumnStatistics {
@@ -746,28 +780,123 @@ namespace orc {
}
};
+ /**
+  * Translate a StreamKind into its display name.
+  * A value without a dedicated name is rendered as "unknown - " followed
+  * by the streamed value of kind.
+  */
+ std::string streamKindToString(StreamKind kind) {
+   const char* label = nullptr;
+   switch (kind) {
+   case StreamKind_PRESENT:
+     label = "present";
+     break;
+   case StreamKind_DATA:
+     label = "data";
+     break;
+   case StreamKind_LENGTH:
+     label = "length";
+     break;
+   case StreamKind_DICTIONARY_DATA:
+     label = "dictionary";
+     break;
+   case StreamKind_DICTIONARY_COUNT:
+     label = "dictionary count";
+     break;
+   case StreamKind_SECONDARY:
+     label = "secondary";
+     break;
+   case StreamKind_ROW_INDEX:
+     label = "index";
+     break;
+   case StreamKind_BLOOM_FILTER:
+     label = "bloom";
+     break;
+   }
+   if (label != nullptr) {
+     return label;
+   }
+   // No case matched: fall back to a generic description.
+   std::stringstream fallback;
+   fallback << "unknown - " << kind;
+   return fallback.str();
+ }
+
+ /**
+  * Translate a ColumnEncodingKind into its display name.
+  * A value without a dedicated name is rendered as "unknown - " followed
+  * by the streamed value of kind.
+  */
+ std::string columnEncodingKindToString(ColumnEncodingKind kind) {
+   const char* label = nullptr;
+   switch (kind) {
+   case ColumnEncodingKind_DIRECT:
+     label = "direct";
+     break;
+   case ColumnEncodingKind_DICTIONARY:
+     label = "dictionary";
+     break;
+   case ColumnEncodingKind_DIRECT_V2:
+     label = "direct rle2";
+     break;
+   case ColumnEncodingKind_DICTIONARY_V2:
+     label = "dictionary rle2";
+     break;
+   }
+   if (label != nullptr) {
+     return label;
+   }
+   // No case matched: fall back to a generic description.
+   std::stringstream fallback;
+   fallback << "unknown - " << kind;
+   return fallback.str();
+ }
+
+ /**
+  * StreamInformation whose values are copied out of a proto::Stream,
+  * together with an offset supplied by the caller.
+  */
+ class StreamInformationImpl: public StreamInformation {
+ private:
+   StreamKind kind;
+   uint64_t column;
+   uint64_t offset;
+   uint64_t length;
+
+ public:
+   /**
+    * @param streamOffset the value to report from getOffset()
+    * @param pbStream the protobuf entry supplying kind, column, and length
+    */
+   StreamInformationImpl(uint64_t streamOffset, const proto::Stream& pbStream)
+       : kind(static_cast<StreamKind>(pbStream.kind())),
+         column(pbStream.column()),
+         offset(streamOffset),
+         length(pbStream.length()) {
+     // PASS
+   }
+
+   ~StreamInformationImpl();
+
+   StreamKind getKind() const override { return kind; }
+
+   uint64_t getColumnId() const override { return column; }
+
+   uint64_t getOffset() const override { return offset; }
+
+   uint64_t getLength() const override { return length; }
+ };
+
+ // The destructor is declared in the class and defined out of line here.
+ StreamInformationImpl::~StreamInformationImpl() {
+   // PASS
+ }
+
class StripeInformationImpl : public StripeInformation {
uint64_t offset;
uint64_t indexLength;
uint64_t dataLength;
uint64_t footerLength;
uint64_t numRows;
-
+ InputStream* stream;
+ MemoryPool& memory;
+ CompressionKind compression;
+ uint64_t blockSize;
+ mutable std::unique_ptr<proto::StripeFooter> stripeFooter;
+ void ensureStripeFooterLoaded() const;
public:
StripeInformationImpl(uint64_t _offset,
uint64_t _indexLength,
uint64_t _dataLength,
uint64_t _footerLength,
- uint64_t _numRows) :
- offset(_offset),
- indexLength(_indexLength),
- dataLength(_dataLength),
- footerLength(_footerLength),
- numRows(_numRows)
- {}
-
- virtual ~StripeInformationImpl();
+ uint64_t _numRows,
+ InputStream* _stream,
+ MemoryPool& _memory,
+ CompressionKind _compression,
+ uint64_t _blockSize
+ ) : offset(_offset),
+ indexLength(_indexLength),
+ dataLength(_dataLength),
+ footerLength(_footerLength),
+ numRows(_numRows),
+ stream(_stream),
+ memory(_memory),
+ compression(_compression),
+ blockSize(_blockSize) {
+ // PASS
+ }
+
+ virtual ~StripeInformationImpl() {
+ // PASS
+ }
uint64_t getOffset() const override {
return offset;
@@ -791,8 +920,68 @@ namespace orc {
uint64_t getNumberOfRows() const override {
return numRows;
}
+
+ // Number of stream entries in the stripe footer, loading the footer on
+ // first use.
+ uint64_t getNumberOfStreams() const override {
+   ensureStripeFooterLoaded();
+   const auto streamCount = stripeFooter->streams_size();
+   return static_cast<uint64_t>(streamCount);
+ }
+
+ std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId
+ ) const override;
+
+ // Encoding kind recorded in the stripe footer for column colId, loading
+ // the footer on first use. colId is not range-checked here.
+ ColumnEncodingKind getColumnEncoding(uint64_t colId) const override {
+   ensureStripeFooterLoaded();
+   const int columnIndex = static_cast<int>(colId);
+   return static_cast<ColumnEncodingKind>(
+       stripeFooter->columns(columnIndex).kind());
+ }
+
+ // Dictionary size recorded in the stripe footer for column colId, loading
+ // the footer on first use. colId is not range-checked here.
+ // The result is converted to uint64_t: a dictionary size is a count, and
+ // routing it through ColumnEncodingKind (which only names the values 0-3)
+ // would force arbitrary sizes through an enum that cannot represent them.
+ uint64_t getDictionarySize(uint64_t colId) const override {
+   ensureStripeFooterLoaded();
+   const int columnIndex = static_cast<int>(colId);
+   return static_cast<uint64_t>(
+       stripeFooter->columns(columnIndex).dictionarysize());
+ }
+
+ // Writer timezone string stored in the stripe footer, loading the footer
+ // on first use. The reference points into the cached footer.
+ const std::string& getWriterTimezone() const override {
+   ensureStripeFooterLoaded();
+   const std::string& zone = stripeFooter->writertimezone();
+   return zone;
+ }
};
+ /**
+  * Read and parse this stripe's footer on first use and cache it in
+  * stripeFooter; once a footer is cached, later calls return immediately.
+  * The footer is read from offset + indexLength + dataLength for
+  * footerLength bytes, through the decompressor for this file.
+  * @throws ParseError if the footer cannot be parsed; in that case nothing
+  *         is cached, so a later call tries again instead of using a
+  *         partially parsed footer.
+  */
+ void StripeInformationImpl::ensureStripeFooterLoaded() const {
+   if (stripeFooter.get() != nullptr) {
+     return;
+   }
+   const uint64_t footerStart = offset + indexLength + dataLength;
+   std::unique_ptr<SeekableInputStream> pbStream =
+     createDecompressor(compression,
+                        std::unique_ptr<SeekableInputStream>
+                          (new SeekableFileInputStream(stream,
+                                                       footerStart,
+                                                       footerLength,
+                                                       memory)),
+                        blockSize,
+                        memory);
+   // Parse into a local object and publish it only on success.
+   std::unique_ptr<proto::StripeFooter> parsed(new proto::StripeFooter());
+   if (!parsed->ParseFromZeroCopyStream(pbStream.get())) {
+     throw ParseError("Failed to parse the stripe footer");
+   }
+   stripeFooter = std::move(parsed);
+ }
+
+ /**
+  * Build the StreamInformation for stream number streamId of this stripe.
+  * The reported offset is the stripe offset plus the lengths of all
+  * streams listed before streamId in the stripe footer.
+  * streamId is not range-checked here.
+  */
+ std::unique_ptr<StreamInformation>
+ StripeInformationImpl::getStreamInformation(uint64_t streamId) const {
+   ensureStripeFooterLoaded();
+   uint64_t start = offset;
+   uint64_t index = 0;
+   while (index < streamId) {
+     start += stripeFooter->streams(static_cast<int>(index)).length();
+     ++index;
+   }
+   const proto::Stream& selected =
+     stripeFooter->streams(static_cast<int>(streamId));
+   return ORC_UNIQUE_PTR<StreamInformation>
+     (new StreamInformationImpl(start, selected));
+ }
+
ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
bool correctStats) {
if (s.has_intstatistics()) {
@@ -869,10 +1058,6 @@ namespace orc {
// PASS
}
- StripeInformationImpl::~StripeInformationImpl() {
- // PASS
- }
-
static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024;
class ReaderImpl : public Reader {
@@ -882,7 +1067,8 @@ namespace orc {
// inputs
std::unique_ptr<InputStream> stream;
ReaderOptions options;
- const uint64_t footerStart;
+ const uint64_t fileLength;
+ const uint64_t postscriptLength;
std::vector<bool> selectedColumns;
// custom memory pool
@@ -931,13 +1117,15 @@ namespace orc {
* @param options options for reading
* @param postscript the postscript for the file
* @param footer the footer for the file
- * @param footerStart the byte offset of the start of the footer
+ * @param fileLength the length of the file in bytes
+ * @param postscriptLength the length of the postscript in bytes
*/
ReaderImpl(std::unique_ptr<InputStream> stream,
const ReaderOptions& options,
std::unique_ptr<proto::PostScript> postscript,
std::unique_ptr<proto::Footer> footer,
- uint64_t footerStart);
+ uint64_t fileLength,
+ uint64_t postscriptLength);
const ReaderOptions& getReaderOptions() const;
@@ -945,6 +1133,8 @@ namespace orc {
std::string getFormatVersion() const override;
+ WriterVersion getWriterVersion() const override;
+
uint64_t getNumberOfRows() const override;
uint64_t getRowIndexStride() const override;
@@ -971,6 +1161,10 @@ namespace orc {
uint64_t getContentLength() const override;
+ uint64_t getStripeStatisticsLength() const override;
+ uint64_t getFileFooterLength() const override;
+ uint64_t getFilePostscriptLength() const override;
+ uint64_t getFileLength() const override;
std::unique_ptr<Statistics> getStatistics() const override;
@@ -1039,11 +1233,13 @@ namespace orc {
const ReaderOptions& opts,
std::unique_ptr<proto::PostScript> _postscript,
std::unique_ptr<proto::Footer> _footer,
- uint64_t _footerStart
+ uint64_t _fileLength,
+ uint64_t _postscriptLength
): epochOffset(getEpochOffset()),
stream(std::move(input)),
options(opts),
- footerStart(_footerStart),
+ fileLength(_fileLength),
+ postscriptLength(_postscriptLength),
memoryPool(*opts.getMemoryPool()),
postscript(std::move(_postscript)),
blockSize(getCompressionBlockSize(*postscript)),
@@ -1114,7 +1310,8 @@ namespace orc {
mutable_ps->CopyFrom(*postscript);
proto::Footer *mutableFooter = tail.mutable_footer();
mutableFooter->CopyFrom(*footer);
- tail.set_footerstart(footerStart);
+ tail.set_filelength(fileLength);
+ tail.set_postscriptlength(postscriptLength);
std::string result;
if (!tail.SerializeToString(&result)) {
throw ParseError("Failed to serialize file tail");
@@ -1160,7 +1357,11 @@ namespace orc {
stripeInfo.indexlength(),
stripeInfo.datalength(),
stripeInfo.footerlength(),
- stripeInfo.numberofrows()));
+ stripeInfo.numberofrows(),
+ stream.get(),
+ memoryPool,
+ compression,
+ blockSize));
}
std::string ReaderImpl::getFormatVersion() const {
@@ -1178,10 +1379,33 @@ namespace orc {
return footer->numberofrows();
}
+ // Writer version from the postscript; a postscript without the field is
+ // reported as WriterVersion_ORIGINAL.
+ WriterVersion ReaderImpl::getWriterVersion() const {
+   return postscript->has_writerversion()
+     ? static_cast<WriterVersion>(postscript->writerversion())
+     : WriterVersion_ORIGINAL;
+ }
+
uint64_t ReaderImpl::getContentLength() const {
return footer->contentlength();
}
+ // Reports the metadatalength field recorded in the postscript.
+ uint64_t ReaderImpl::getStripeStatisticsLength() const {
+ return postscript->metadatalength();
+ }
+
+ // Reports the footerlength field recorded in the postscript.
+ uint64_t ReaderImpl::getFileFooterLength() const {
+ return postscript->footerlength();
+ }
+
+ // Reports the postscriptLength given to the constructor. readMetadata()
+ // subtracts one further byte beyond this value when locating the
+ // metadata, so that extra trailing byte is not part of this count.
+ uint64_t ReaderImpl::getFilePostscriptLength() const {
+ return postscriptLength;
+ }
+
+ // Reports the fileLength given to the constructor.
+ uint64_t ReaderImpl::getFileLength() const {
+ return fileLength;
+ }
+
uint64_t ReaderImpl::getRowIndexStride() const {
return footer->rowindexstride();
}
@@ -1255,7 +1479,8 @@ namespace orc {
void ReaderImpl::readMetadata() const {
uint64_t metadataSize = postscript->metadatalength();
- uint64_t metadataStart = footerStart - metadataSize;
+ uint64_t metadataStart = fileLength - metadataSize
+ - postscript->footerlength() - postscriptLength - 1;
if (metadataSize != 0) {
std::unique_ptr<SeekableInputStream> pbStream =
createDecompressor(compression,
@@ -1331,7 +1556,7 @@ namespace orc {
}
bool ReaderImpl::hasCorrectStatistics() const {
- return postscript->has_writerversion() && postscript->writerversion();
+ return getWriterVersion() != WriterVersion_ORIGINAL;
}
proto::StripeFooter ReaderImpl::getStripeFooter
@@ -1711,8 +1936,9 @@ namespace orc {
MemoryPool *memoryPool = options.getMemoryPool();
std::unique_ptr<proto::PostScript> ps;
std::unique_ptr<proto::Footer> footer;
- uint64_t footerStart;
std::string serializedFooter = options.getSerializedFileTail();
+ uint64_t fileLength;
+ uint64_t postscriptLength;
if (serializedFooter.length() != 0) {
// Parse the file tail from the serialized one.
proto::FileTail tail;
@@ -1721,30 +1947,30 @@ namespace orc {
}
ps.reset(new proto::PostScript(tail.postscript()));
footer.reset(new proto::Footer(tail.footer()));
- footerStart = tail.footerstart();
+ fileLength = tail.filelength();
+ postscriptLength = tail.postscriptlength();
} else {
// figure out the size of the file using the option or filesystem
- uint64_t size = std::min(options.getTailLocation(),
- static_cast<uint64_t>(stream->getLength()));
+ fileLength = std::min(options.getTailLocation(),
+ static_cast<uint64_t>(stream->getLength()));
//read last bytes into buffer to get PostScript
- uint64_t readSize = std::min(size, DIRECTORY_SIZE_GUESS);
+ uint64_t readSize = std::min(fileLength, DIRECTORY_SIZE_GUESS);
if (readSize < 4) {
throw ParseError("File size too small");
}
DataBuffer<char> *buffer = new DataBuffer<char>(*memoryPool, readSize);
- stream->read(buffer->data(), readSize, size - readSize);
+ stream->read(buffer->data(), readSize, fileLength - readSize);
- uint64_t postscriptSize = buffer->data()[readSize - 1] & 0xff;
- ps = readPostscript(stream.get(), buffer, postscriptSize);
+ postscriptLength = buffer->data()[readSize - 1] & 0xff;
+ ps = readPostscript(stream.get(), buffer, postscriptLength);
uint64_t footerSize = ps->footerlength();
- uint64_t tailSize = 1 + postscriptSize + footerSize;
- footerStart = size - tailSize;
+ uint64_t tailSize = 1 + postscriptLength + footerSize;
uint64_t footerOffset;
if (tailSize > readSize) {
buffer->resize(footerSize);
- stream->read(buffer->data(), footerSize, size - tailSize);
+ stream->read(buffer->data(), footerSize, fileLength - tailSize);
footerOffset = 0;
} else {
footerOffset = readSize - tailSize;
@@ -1758,7 +1984,8 @@ namespace orc {
options,
std::move(ps),
std::move(footer),
- footerStart));
+ fileLength,
+ postscriptLength));
}
ColumnStatistics::~ColumnStatistics() {
@@ -1952,7 +2179,7 @@ namespace orc {
_hasMinimum = false;
_hasMaximum = false;
_hasTotalLength = false;
-
+
totalLength = 0;
}else{
const proto::StringStatistics& stats = pb.stringstatistics();
http://git-wip-us.apache.org/repos/asf/orc/blob/aad3581b/proto/orc_proto.proto
----------------------------------------------------------------------
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index 21b7f7c..502667f 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -222,5 +222,6 @@ message PostScript {
message FileTail {
optional PostScript postscript = 1;
optional Footer footer = 2;
- optional uint64 footerStart = 3;
+ optional uint64 fileLength = 3;
+ optional uint64 postscriptLength = 4;
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/orc/blob/aad3581b/tools/src/FileMetadata.cc
----------------------------------------------------------------------
diff --git a/tools/src/FileMetadata.cc b/tools/src/FileMetadata.cc
index 13db666..950e955 100644
--- a/tools/src/FileMetadata.cc
+++ b/tools/src/FileMetadata.cc
@@ -16,168 +16,161 @@
* limitations under the License.
*/
+#include <getopt.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <sstream>
-#include <iomanip>
-#include "wrap/orc-proto-wrapper.hh"
#include "orc/OrcFile.hh"
-using namespace orc::proto;
-
-uint64_t getTotalPaddingSize(const Footer& footer) {
- uint64_t paddedBytes = 0;
- StripeInformation stripe;
- for (int stripeIx=1; stripeIx<footer.stripes_size(); stripeIx++) {
- stripe = footer.stripes(stripeIx-1);
- uint64_t prevStripeOffset = stripe.offset();
- uint64_t prevStripeLen = stripe.datalength() + stripe.indexlength() +
- stripe.footerlength();
- paddedBytes += footer.stripes(stripeIx).offset() -
- (prevStripeOffset + prevStripeLen);
- };
- return paddedBytes;
+/**
+ * Write one stripe's description to out as a JSON object.
+ * @param out the stream to write to
+ * @param index the value printed as the "stripe" field
+ * @param columns the number of column ids (0 .. columns-1) whose encodings
+ *        are printed in verbose mode
+ * @param stripe the stripe to describe
+ * @param verbose when true, also print the column encodings, the streams,
+ *        and the writer timezone if it is non-empty
+ */
+void printStripeInformation(std::ostream& out,
+                            uint64_t index,
+                            uint64_t columns,
+                            std::unique_ptr<orc::StripeInformation> stripe,
+                            bool verbose) {
+  out << " { \"stripe\": " << index
+      << ", \"rows\": " << stripe->getNumberOfRows() << ",\n";
+  out << " \"offset\": " << stripe->getOffset()
+      << ", \"length\": " << stripe->getLength() << ",\n";
+  out << " \"index\": " << stripe->getIndexLength()
+      << ", \"data\": " << stripe->getDataLength()
+      << ", \"footer\": " << stripe->getFooterLength();
+  if (verbose) {
+    out << ",\n \"encodings\": [\n";
+    for(uint64_t col=0; col < columns; ++col) {
+      if (col != 0) {
+        out << ",\n";
+      }
+      orc::ColumnEncodingKind encoding = stripe->getColumnEncoding(col);
+      out << " { \"column\": " << col
+          << ", \"encoding\": \""
+          << orc::columnEncodingKindToString(encoding) << "\"";
+      // Only dictionary encodings carry a dictionary size worth printing.
+      if (encoding == orc::ColumnEncodingKind_DICTIONARY ||
+          encoding == orc::ColumnEncodingKind_DICTIONARY_V2) {
+        out << ", \"count\": " << stripe->getDictionarySize(col);
+      }
+      out << " }";
+    }
+    out << "\n ],\n";
+    out << " \"streams\": [\n";
+    // The stream count does not change while printing; ask for it once.
+    const uint64_t streamCount = stripe->getNumberOfStreams();
+    for(uint64_t str = 0; str < streamCount; ++str) {
+      if (str != 0) {
+        out << ",\n";
+      }
+      ORC_UNIQUE_PTR<orc::StreamInformation> stream =
+        stripe->getStreamInformation(str);
+      out << " { \"id\": " << str
+          << ", \"column\": " << stream->getColumnId()
+          << ", \"kind\": \"" << orc::streamKindToString(stream->getKind())
+          << "\", \"offset\": " << stream->getOffset()
+          << ", \"length\": " << stream->getLength() << " }";
+    }
+    out << "\n ]";
+    // Bind by reference: getWriterTimezone() returns a const reference, so
+    // no copy of the string is needed.
+    const std::string& tz = stripe->getWriterTimezone();
+    if (tz.length() != 0) {
+      out << ",\n \"timezone\": \"" << tz << "\"";
+    }
+  }
+  out << "\n }";
 }
-void printMetadata(const char*filename) {
- std::streamsize origPrecision(std::cout.precision());
- std::ios::fmtflags origFlags(std::cout.flags());
- std::cout << "Structure for " << filename << std::endl;
- std::ifstream input;
-
- input.open(filename, std::ios::in | std::ios::binary);
- input.seekg(0,input.end);
- std::streamoff fileSize = input.tellg();
-
- // Read the postscript size
- input.seekg(fileSize-1);
- int result = input.get();
- if (result == EOF) {
- throw std::runtime_error("Failed to read postscript size");
+void printMetadata(std::ostream & out, const char*filename, bool verbose) {
+ std::unique_ptr<orc::Reader> reader =
+ orc::createReader(orc::readLocalFile(filename), orc::ReaderOptions());
+ out << "{ \"name\": \"" << filename << "\",\n";
+ uint64_t numberColumns = reader->getType().getMaximumColumnId() + 1;
+ out << " \"type\": \""
+ << reader->getType().toString() << "\",\n";
+ out << " \"rows\": " << reader->getNumberOfRows() << ",\n";
+ uint64_t stripeCount = reader->getNumberOfStripes();
+ out << " \"stripe count\": " << stripeCount << ",\n";
+ out << " \"format\": \"" << reader->getFormatVersion()
+ << "\", \"writer version\": \""
+ << orc::writerVersionToString(reader->getWriterVersion())
+ << "\",\n";
+ out << " \"compression\": \""
+ << orc::compressionKindToString(reader->getCompression())
+ << "\",";
+ if (reader->getCompression() != orc::CompressionKind_NONE) {
+ out << " \"compression block\": "
+ << reader->getCompressionSize() << ",";
}
- std::streamoff postscriptSize = result;
-
- // Read the postscript
- input.seekg(fileSize - postscriptSize-1);
- std::vector<char> buffer(static_cast<size_t>(postscriptSize));
- input.read(buffer.data(), postscriptSize);
- PostScript postscript ;
- postscript.ParseFromArray(buffer.data(),
- static_cast<int>(postscriptSize));
- std::cout << std::endl << " === Postscript === " << std::endl ;
- postscript.PrintDebugString();
-
- // Everything but the postscript is compressed
- switch (static_cast<int>(postscript.compression())) {
- case NONE:
- break;
- case ZLIB:
- case SNAPPY:
- case LZO:
- default:
- input.close();
- throw std::logic_error("ORC files with compression are not supported");
+ out << "\n \"file length\": " << reader->getFileLength() << ",\n";
+ out << " \"content\": " << reader->getContentLength()
+ << ", \"stripe stats\": " << reader->getStripeStatisticsLength()
+ << ", \"footer\": " << reader->getFileFooterLength()
+ << ", \"postscript\": " << reader->getFilePostscriptLength() << ",\n";
+ if (reader->getRowIndexStride()) {
+ out << " \"row index stride\": "
+ << reader->getRowIndexStride() << ",\n";
}
-
- std::streamoff footerSize =
- static_cast<std::streamoff>(postscript.footerlength());
- std::streamoff metadataSize =
- static_cast<std::streamoff>(postscript.metadatalength());
-
- // Read the metadata
- input.seekg(fileSize - 1 - postscriptSize - footerSize - metadataSize);
- buffer.resize(static_cast<size_t>(metadataSize));
- input.read(buffer.data(), metadataSize);
- Metadata metadata ;
- metadata.ParseFromArray(buffer.data(), static_cast<int>(metadataSize));
-
- // Read the footer
- //input.seekg(fileSize -1 - postscriptSize-footerSize);
- buffer.resize(static_cast<size_t>(footerSize));
- input.read(buffer.data(), footerSize);
- Footer footer ;
- footer.ParseFromArray(buffer.data(), static_cast<int>(footerSize));
- std::cout << std::endl << " === Footer === " << std::endl ;
- footer.PrintDebugString();
-
- std::cout << std::endl << "=== Stripe Statistics ===" << std::endl;
-
- StripeInformation stripe ;
- Stream section;
- ColumnEncoding encoding;
- for (int stripeIx=0; stripeIx<footer.stripes_size(); stripeIx++) {
- std::cout << "Stripe " << stripeIx+1 <<": " << std::endl ;
- stripe = footer.stripes(stripeIx);
- stripe.PrintDebugString();
-
- std::streamoff offset =
- static_cast<std::streamoff>(stripe.offset() + stripe.indexlength() +
- stripe.datalength());
- std::streamoff tailLength =
- static_cast<std::streamoff>(stripe.footerlength());
-
- // read the stripe footer
- input.seekg(offset);
- buffer.resize(static_cast<size_t>(tailLength));
- input.read(buffer.data(), tailLength);
-
- StripeFooter stripeFooter;
- stripeFooter.ParseFromArray(buffer.data(), static_cast<int>(tailLength));
- //stripeFooter.PrintDebugString();
- uint64_t stripeStart = stripe.offset();
- uint64_t sectionStart = stripeStart;
- for (int streamIx=0; streamIx<stripeFooter.streams_size(); streamIx++) {
- section = stripeFooter.streams(streamIx);
- std::cout << " Stream: column " << section.column()
- << " section "
- << section.kind() << " start: " << sectionStart
- << " length " << section.length() << std::endl;
- sectionStart += section.length();
- };
- for (int columnIx=0; columnIx<stripeFooter.columns_size();
- columnIx++) {
- encoding = stripeFooter.columns(columnIx);
- std::cout << " Encoding column " << columnIx << ": "
- << encoding.kind() ;
- if (encoding.kind() == ColumnEncoding_Kind_DICTIONARY ||
- encoding.kind() == ColumnEncoding_Kind_DICTIONARY_V2)
- std::cout << "[" << encoding.dictionarysize() << "]";
- std::cout << std::endl;
- };
- };
-
- uint64_t paddedBytes = getTotalPaddingSize(footer);
- // empty ORC file is ~45 bytes. Assumption here is file length always >0
- double percentPadding =
- static_cast<double>(paddedBytes) * 100 / static_cast<double>(fileSize);
- std::cout << "File length: " << fileSize << " bytes" << std::endl;
- std::cout <<"Padding length: " << paddedBytes << " bytes" << std::endl;
- std::cout <<"Padding ratio: " << std::fixed << std::setprecision(2)
- << percentPadding << " %" << std::endl;
- std::cout.precision(origPrecision);
- std::cout.flags(origFlags);
- input.close();
+ out << " \"user metadata\": {";
+ std::list<std::string> keys = reader->getMetadataKeys();
+ uint64_t remaining = keys.size();
+ for(std::list<std::string>::const_iterator itr = keys.begin();
+ itr != keys.end(); ++itr) {
+ out << "\n \"" << *itr << "\": \""
+ << reader->getMetadataValue(*itr) << "\"";
+ if (--remaining != 0) {
+ out << ",";
+ }
+ }
+ out << "\n },\n";
+ out << " \"stripes\": [\n";
+ for(uint64_t i=0; i < stripeCount; ++i) {
+ printStripeInformation(out, i, numberColumns, reader->getStripe(i),
+ verbose);
+ if (i == stripeCount - 1) {
+ out << "\n";
+ } else {
+ out << ",\n";
+ }
+ }
+ out << " ]\n";
+ out << "}\n";
}
int main(int argc, char* argv[]) {
- GOOGLE_PROTOBUF_VERIFY_VERSION;
-
- if (argc < 2) {
- std::cout << "Usage: file-metadata <filename>\n";
- }
- try {
- printMetadata(argv[1]);
- } catch (std::exception& ex) {
- std::cerr << "Caught exception: " << ex.what() << "\n";
- return 1;
+ static struct option longOptions[] = {
+ {"help", no_argument, nullptr, 'h'},
+ {"verbose", no_argument, nullptr, 'v'},
+ {nullptr, 0, nullptr, 0}
+ };
+ bool helpFlag = false;
+ bool verboseFlag = false;
+ int opt;
+ do {
+ opt = getopt_long(argc, argv, "hv", longOptions, nullptr);
+ switch (opt) {
+ case '?':
+ case 'h':
+ helpFlag = true;
+ opt = -1;
+ break;
+ case 'v':
+ verboseFlag = true;
+ break;
+ }
+ } while (opt != -1);
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1 || helpFlag) {
+ std::cerr
+ << "Usage: file-metadata [-h] [--help] [-v] [--verbose] <filename>\n";
+ exit(1);
+ } else {
+ for(int i=0; i < argc; ++i) {
+ try {
+ printMetadata(std::cout, argv[i], verboseFlag);
+ } catch (std::exception& ex) {
+ std::cerr << "Caught exception in " << argv[i]
+ << ": " << ex.what() << "\n";
+ return 1;
+ }
+ }
}
-
- google::protobuf::ShutdownProtobufLibrary();
-
return 0;
}
-
-