You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2016/01/19 22:12:22 UTC

orc git commit: ORC-8. Reimplement file-metadata to use the reader API. (omalley reviewed by asandryh)

Repository: orc
Updated Branches:
  refs/heads/master 6fa860f2e -> aad3581bc


ORC-8. Reimplement file-metadata to use the reader API. (omalley reviewed by
asandryh)

Signed-off-by: Owen O'Malley <om...@apache.org>

Fixes apache/orc#15


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/aad3581b
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/aad3581b
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/aad3581b

Branch: refs/heads/master
Commit: aad3581bc9f66d8f16eacaf87949f743e548f20d
Parents: 6fa860f
Author: Owen O'Malley <om...@apache.org>
Authored: Fri Jan 15 12:48:41 2016 -0800
Committer: Owen O'Malley <om...@apache.org>
Committed: Tue Jan 19 13:11:34 2016 -0800

----------------------------------------------------------------------
 c++/include/orc/Reader.hh | 114 +++++++++++++++-
 c++/src/Reader.cc         | 301 ++++++++++++++++++++++++++++++++++++-----
 proto/orc_proto.proto     |   3 +-
 tools/src/FileMetadata.cc | 287 +++++++++++++++++++--------------------
 4 files changed, 518 insertions(+), 187 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/aad3581b/c++/include/orc/Reader.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index d924fbf..b6c5480 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -40,6 +40,22 @@ namespace orc {
   };
 
   /**
+   * Get the name of the CompressionKind.
+   */
+  std::string compressionKindToString(CompressionKind kind);
+
+  enum WriterVersion {
+    WriterVersion_ORIGINAL = 0,
+    WriterVersion_HIVE_8732 = 1,
+    WriterVersion_HIVE_4243 = 2
+  };
+
+  /**
+   * Get the name of the WriterVersion.
+   */
+  std::string writerVersionToString(WriterVersion kind);
+
+  /**
    * Statistics that are available for all types of columns.
    */
   class ColumnStatistics {
@@ -337,6 +353,41 @@ namespace orc {
     virtual int64_t getMaximum() const = 0;
   };
 
+  enum StreamKind {
+    StreamKind_PRESENT = 0,
+    StreamKind_DATA = 1,
+    StreamKind_LENGTH = 2,
+    StreamKind_DICTIONARY_DATA = 3,
+    StreamKind_DICTIONARY_COUNT = 4,
+    StreamKind_SECONDARY = 5,
+    StreamKind_ROW_INDEX = 6,
+    StreamKind_BLOOM_FILTER = 7
+  };
+
+  /**
+   * Get the string representation of the StreamKind.
+   */
+  std::string streamKindToString(StreamKind kind);
+
+  class StreamInformation {
+  public:
+    virtual ~StreamInformation();
+
+    virtual StreamKind getKind() const = 0;
+    virtual uint64_t getColumnId() const = 0;
+    virtual uint64_t getOffset() const = 0;
+    virtual uint64_t getLength() const = 0;
+  };
+
+  enum ColumnEncodingKind {
+    ColumnEncodingKind_DIRECT = 0,
+    ColumnEncodingKind_DICTIONARY = 1,
+    ColumnEncodingKind_DIRECT_V2 = 2,
+    ColumnEncodingKind_DICTIONARY_V2 = 3
+  };
+
+  std::string columnEncodingKindToString(ColumnEncodingKind kind);
+
   class StripeInformation {
   public:
     virtual ~StripeInformation();
@@ -376,6 +427,35 @@ namespace orc {
      * @return a count of the number of rows
      */
     virtual uint64_t getNumberOfRows() const = 0;
+
+    /**
+     * Get the number of streams in the stripe.
+     */
+    virtual uint64_t getNumberOfStreams() const = 0;
+
+    /**
+     * Get the StreamInformation for the given stream.
+     */
+    virtual ORC_UNIQUE_PTR<StreamInformation>
+      getStreamInformation(uint64_t streamId) const = 0;
+
+    /**
+     * Get the column encoding for the given column.
+     * @param colId the columnId
+     */
+    virtual ColumnEncodingKind getColumnEncoding(uint64_t colId) const = 0;
+
+    /**
+     * Get the dictionary size.
+     * @param colId the columnId
+     * @return the size of the dictionary or 0 if there isn't one
+     */
+    virtual uint64_t getDictionarySize(uint64_t colId) const = 0;
+
+    /**
+     * Get the writer timezone.
+     */
+    virtual const std::string& getWriterTimezone() const = 0;
   };
 
   class Statistics {
@@ -616,6 +696,12 @@ namespace orc {
     virtual uint64_t getCompressionSize() const = 0;
 
     /**
+     * Get the version of the writer.
+     * @return the version of the writer.
+     */
+    virtual WriterVersion getWriterVersion() const = 0;
+
+    /**
      * Get the number of rows per a entry in the row index.
      * @return the number of rows per an entry in the row index or 0 if there
      * is no row index.
@@ -651,12 +737,36 @@ namespace orc {
     getStripeStatistics(uint64_t stripeIndex) const = 0;
 
     /**
-     * Get the length of the file.
-     * @return the number of bytes in the file
+     * Get the length of the data stripes in the file.
+     * @return the number of bytes in stripes
      */
     virtual uint64_t getContentLength() const = 0;
 
     /**
+     * Get the length of the file stripe statistics
+     * @return the number of compressed bytes in the file stripe statistics
+     */
+    virtual uint64_t getStripeStatisticsLength() const = 0;
+
+    /**
+     * Get the length of the file footer
+     * @return the number of compressed bytes in the file footer
+     */
+    virtual uint64_t getFileFooterLength() const = 0;
+
+    /**
+     * Get the length of the file postscript
+     * @return the number of bytes in the file postscript
+     */
+    virtual uint64_t getFilePostscriptLength() const = 0;
+
+    /**
+     * Get the total length of the file.
+     * @return the number of bytes in the file
+     */
+    virtual uint64_t getFileLength() const = 0;
+
+    /**
      * Get the statistics about the columns in the file.
      * @return the information about the column
      */

http://git-wip-us.apache.org/repos/asf/orc/blob/aad3581b/c++/src/Reader.cc
----------------------------------------------------------------------
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index 940ef16..29bd439 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -38,6 +38,36 @@
 
 namespace orc {
 
+  std::string compressionKindToString(CompressionKind kind) {
+    switch (kind) {
+    case CompressionKind_NONE:
+      return "none";
+    case CompressionKind_ZLIB:
+      return "zlib";
+    case CompressionKind_SNAPPY:
+      return "snappy";
+    case CompressionKind_LZO:
+      return "LZO";
+    }
+    std::stringstream buffer;
+    buffer << "unknown - " << kind;
+    return buffer.str();
+  }
+
+  std::string writerVersionToString(WriterVersion version) {
+    switch (version) {
+    case WriterVersion_ORIGINAL:
+      return "original";
+    case WriterVersion_HIVE_8732:
+      return "HIVE-8732";
+    case WriterVersion_HIVE_4243:
+      return "HIVE-4243";
+    }
+    std::stringstream buffer;
+    buffer << "future - " << version;
+    return buffer.str();
+  }
+
   struct ReaderOptionsPrivate {
     bool setIndexes;
     bool setNames;
@@ -199,8 +229,12 @@ namespace orc {
     return privateBits->serializedTail;
   }
 
-  StripeInformation::~StripeInformation() {
+  StreamInformation::~StreamInformation() {
+    // PASS
+  }
 
+  StripeInformation::~StripeInformation() {
+    // PASS
   }
 
   class ColumnStatisticsImpl: public ColumnStatistics {
@@ -746,28 +780,123 @@ namespace orc {
     }
   };
 
+  std::string streamKindToString(StreamKind kind) {
+    switch (kind) {
+    case StreamKind_PRESENT:
+      return "present";
+    case StreamKind_DATA:
+      return "data";
+    case StreamKind_LENGTH:
+      return "length";
+    case StreamKind_DICTIONARY_DATA:
+      return "dictionary";
+    case StreamKind_DICTIONARY_COUNT:
+      return "dictionary count";
+    case StreamKind_SECONDARY:
+      return "secondary";
+    case StreamKind_ROW_INDEX:
+      return "index";
+    case StreamKind_BLOOM_FILTER:
+      return "bloom";
+    }
+    std::stringstream buffer;
+    buffer << "unknown - " << kind;
+    return buffer.str();
+  }
+
+  std::string columnEncodingKindToString(ColumnEncodingKind kind) {
+    switch (kind) {
+    case ColumnEncodingKind_DIRECT:
+      return "direct";
+    case ColumnEncodingKind_DICTIONARY:
+      return "dictionary";
+    case ColumnEncodingKind_DIRECT_V2:
+      return "direct rle2";
+    case ColumnEncodingKind_DICTIONARY_V2:
+      return "dictionary rle2";
+    }
+    std::stringstream buffer;
+    buffer << "unknown - " << kind;
+    return buffer.str();
+  }
+
+  class StreamInformationImpl: public StreamInformation {
+  private:
+    StreamKind kind;
+    uint64_t column;
+    uint64_t offset;
+    uint64_t length;
+  public:
+    StreamInformationImpl(uint64_t _offset,
+                          const proto::Stream& stream
+                          ): kind(static_cast<StreamKind>(stream.kind())),
+                             column(stream.column()),
+                             offset(_offset),
+                             length(stream.length()) {
+      // PASS
+    }
+
+    ~StreamInformationImpl();
+
+    StreamKind getKind() const override {
+      return kind;
+    }
+
+    uint64_t getColumnId() const override {
+      return column;
+    }
+
+    uint64_t getOffset() const override {
+      return offset;
+    }
+
+    uint64_t getLength() const override {
+      return length;
+    }
+  };
+
+  StreamInformationImpl::~StreamInformationImpl() {
+    // PASS
+  }
+
   class StripeInformationImpl : public StripeInformation {
     uint64_t offset;
     uint64_t indexLength;
     uint64_t dataLength;
     uint64_t footerLength;
     uint64_t numRows;
-
+    InputStream* stream;
+    MemoryPool& memory;
+    CompressionKind compression;
+    uint64_t blockSize;
+    mutable std::unique_ptr<proto::StripeFooter> stripeFooter;
+    void ensureStripeFooterLoaded() const;
   public:
 
     StripeInformationImpl(uint64_t _offset,
                           uint64_t _indexLength,
                           uint64_t _dataLength,
                           uint64_t _footerLength,
-                          uint64_t _numRows) :
-      offset(_offset),
-      indexLength(_indexLength),
-      dataLength(_dataLength),
-      footerLength(_footerLength),
-      numRows(_numRows)
-    {}
-
-    virtual ~StripeInformationImpl();
+                          uint64_t _numRows,
+                          InputStream* _stream,
+                          MemoryPool& _memory,
+                          CompressionKind _compression,
+                          uint64_t _blockSize
+                          ) : offset(_offset),
+                              indexLength(_indexLength),
+                              dataLength(_dataLength),
+                              footerLength(_footerLength),
+                              numRows(_numRows),
+                              stream(_stream),
+                              memory(_memory),
+                              compression(_compression),
+                              blockSize(_blockSize) {
+      // PASS
+    }
+
+    virtual ~StripeInformationImpl() {
+      // PASS
+    }
 
     uint64_t getOffset() const override {
       return offset;
@@ -791,8 +920,68 @@ namespace orc {
     uint64_t getNumberOfRows() const override {
       return numRows;
     }
+
+    uint64_t getNumberOfStreams() const override {
+      ensureStripeFooterLoaded();
+      return static_cast<uint64_t>(stripeFooter->streams_size());
+    }
+
+    std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId
+                                                            ) const override;
+
+    ColumnEncodingKind getColumnEncoding(uint64_t colId) const override {
+      ensureStripeFooterLoaded();
+      return static_cast<ColumnEncodingKind>(stripeFooter->
+                                             columns(static_cast<int>(colId))
+                                             .kind());
+    }
+
+    // Dictionary entry count for colId, as recorded in the stripe footer.
+    uint64_t getDictionarySize(uint64_t colId) const override {
+      ensureStripeFooterLoaded();  // parses the stripe footer on first use
+      return static_cast<uint64_t>(
+          stripeFooter->columns(static_cast<int>(colId)).dictionarysize());
+    }
+
+    const std::string& getWriterTimezone() const override {
+      ensureStripeFooterLoaded();
+      return stripeFooter->writertimezone();
+    }
   };
 
+  void StripeInformationImpl::ensureStripeFooterLoaded() const {
+    if (stripeFooter.get() == nullptr) {
+      std::unique_ptr<SeekableInputStream> pbStream =
+        createDecompressor(compression,
+                           std::unique_ptr<SeekableInputStream>
+                             (new SeekableFileInputStream(stream,
+                                                          offset +
+                                                            indexLength +
+                                                            dataLength,
+                                                          footerLength,
+                                                          memory)),
+                           blockSize,
+                           memory);
+      stripeFooter.reset(new proto::StripeFooter());
+      if (!stripeFooter->ParseFromZeroCopyStream(pbStream.get())) {
+        throw ParseError("Failed to parse the stripe footer");
+      }
+    }
+  }
+
+  std::unique_ptr<StreamInformation>
+     StripeInformationImpl::getStreamInformation(uint64_t streamId) const {
+    ensureStripeFooterLoaded();
+    uint64_t streamOffset = offset;
+    for(uint64_t s=0; s < streamId; ++s) {
+      streamOffset += stripeFooter->streams(static_cast<int>(s)).length();
+    }
+    return ORC_UNIQUE_PTR<StreamInformation>
+      (new StreamInformationImpl(streamOffset,
+                                 stripeFooter->
+                                   streams(static_cast<int>(streamId))));
+  }
+
   ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
                                             bool correctStats) {
     if (s.has_intstatistics()) {
@@ -869,10 +1058,6 @@ namespace orc {
     // PASS
   }
 
-  StripeInformationImpl::~StripeInformationImpl() {
-    // PASS
-  }
-
   static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024;
 
   class ReaderImpl : public Reader {
@@ -882,7 +1067,8 @@ namespace orc {
     // inputs
     std::unique_ptr<InputStream> stream;
     ReaderOptions options;
-    const uint64_t footerStart;
+    const uint64_t fileLength;
+    const uint64_t postscriptLength;
     std::vector<bool> selectedColumns;
 
     // custom memory pool
@@ -931,13 +1117,15 @@ namespace orc {
      * @param options options for reading
      * @param postscript the postscript for the file
      * @param footer the footer for the file
-     * @param footerStart the byte offset of the start of the footer
+     * @param fileLength the length of the file in bytes
+     * @param postscriptLength the length of the postscript in bytes
      */
     ReaderImpl(std::unique_ptr<InputStream> stream,
                const ReaderOptions& options,
                std::unique_ptr<proto::PostScript> postscript,
                std::unique_ptr<proto::Footer> footer,
-               uint64_t footerStart);
+               uint64_t fileLength,
+               uint64_t postscriptLength);
 
     const ReaderOptions& getReaderOptions() const;
 
@@ -945,6 +1133,8 @@ namespace orc {
 
     std::string getFormatVersion() const override;
 
+    WriterVersion getWriterVersion() const override;
+
     uint64_t getNumberOfRows() const override;
 
     uint64_t getRowIndexStride() const override;
@@ -971,6 +1161,10 @@ namespace orc {
 
 
     uint64_t getContentLength() const override;
+    uint64_t getStripeStatisticsLength() const override;
+    uint64_t getFileFooterLength() const override;
+    uint64_t getFilePostscriptLength() const override;
+    uint64_t getFileLength() const override;
 
     std::unique_ptr<Statistics> getStatistics() const override;
 
@@ -1039,11 +1233,13 @@ namespace orc {
                          const ReaderOptions& opts,
                          std::unique_ptr<proto::PostScript> _postscript,
                          std::unique_ptr<proto::Footer> _footer,
-                         uint64_t _footerStart
+                         uint64_t _fileLength,
+                         uint64_t _postscriptLength
                          ): epochOffset(getEpochOffset()),
                             stream(std::move(input)),
                             options(opts),
-                            footerStart(_footerStart),
+                            fileLength(_fileLength),
+                            postscriptLength(_postscriptLength),
                             memoryPool(*opts.getMemoryPool()),
                             postscript(std::move(_postscript)),
                             blockSize(getCompressionBlockSize(*postscript)),
@@ -1114,7 +1310,8 @@ namespace orc {
     mutable_ps->CopyFrom(*postscript);
     proto::Footer *mutableFooter = tail.mutable_footer();
     mutableFooter->CopyFrom(*footer);
-    tail.set_footerstart(footerStart);
+    tail.set_filelength(fileLength);
+    tail.set_postscriptlength(postscriptLength);
     std::string result;
     if (!tail.SerializeToString(&result)) {
       throw ParseError("Failed to serialize file tail");
@@ -1160,7 +1357,11 @@ namespace orc {
         stripeInfo.indexlength(),
         stripeInfo.datalength(),
         stripeInfo.footerlength(),
-        stripeInfo.numberofrows()));
+        stripeInfo.numberofrows(),
+        stream.get(),
+        memoryPool,
+        compression,
+        blockSize));
   }
 
   std::string ReaderImpl::getFormatVersion() const {
@@ -1178,10 +1379,33 @@ namespace orc {
     return footer->numberofrows();
   }
 
+  WriterVersion ReaderImpl::getWriterVersion() const {
+    if (!postscript->has_writerversion()) {
+      return WriterVersion_ORIGINAL;
+    }
+    return static_cast<WriterVersion>(postscript->writerversion());
+  }
+
   uint64_t ReaderImpl::getContentLength() const {
     return footer->contentlength();
   }
 
+  uint64_t ReaderImpl::getStripeStatisticsLength() const {
+    return postscript->metadatalength();
+  }
+
+  uint64_t ReaderImpl::getFileFooterLength() const {
+    return postscript->footerlength();
+  }
+
+  uint64_t ReaderImpl::getFilePostscriptLength() const {
+    return postscriptLength;
+  }
+
+  uint64_t ReaderImpl::getFileLength() const {
+    return fileLength;
+  }
+
   uint64_t ReaderImpl::getRowIndexStride() const {
     return footer->rowindexstride();
   }
@@ -1255,7 +1479,8 @@ namespace orc {
 
   void ReaderImpl::readMetadata() const {
     uint64_t metadataSize = postscript->metadatalength();
-    uint64_t metadataStart = footerStart - metadataSize;
+    uint64_t metadataStart = fileLength - metadataSize
+      - postscript->footerlength() - postscriptLength - 1;
     if (metadataSize != 0) {
       std::unique_ptr<SeekableInputStream> pbStream =
         createDecompressor(compression,
@@ -1331,7 +1556,7 @@ namespace orc {
   }
 
   bool ReaderImpl::hasCorrectStatistics() const {
-    return postscript->has_writerversion() && postscript->writerversion();
+    return getWriterVersion() != WriterVersion_ORIGINAL;
   }
 
   proto::StripeFooter ReaderImpl::getStripeFooter
@@ -1711,8 +1936,9 @@ namespace orc {
     MemoryPool *memoryPool = options.getMemoryPool();
     std::unique_ptr<proto::PostScript> ps;
     std::unique_ptr<proto::Footer> footer;
-    uint64_t footerStart;
     std::string serializedFooter = options.getSerializedFileTail();
+    uint64_t fileLength;
+    uint64_t postscriptLength;
     if (serializedFooter.length() != 0) {
       // Parse the file tail from the serialized one.
       proto::FileTail tail;
@@ -1721,30 +1947,30 @@ namespace orc {
       }
       ps.reset(new proto::PostScript(tail.postscript()));
       footer.reset(new proto::Footer(tail.footer()));
-      footerStart = tail.footerstart();
+      fileLength = tail.filelength();
+      postscriptLength = tail.postscriptlength();
     } else {
       // figure out the size of the file using the option or filesystem
-      uint64_t size = std::min(options.getTailLocation(),
-                               static_cast<uint64_t>(stream->getLength()));
+      fileLength = std::min(options.getTailLocation(),
+                            static_cast<uint64_t>(stream->getLength()));
 
       //read last bytes into buffer to get PostScript
-      uint64_t readSize = std::min(size, DIRECTORY_SIZE_GUESS);
+      uint64_t readSize = std::min(fileLength, DIRECTORY_SIZE_GUESS);
       if (readSize < 4) {
         throw ParseError("File size too small");
       }
       DataBuffer<char> *buffer = new DataBuffer<char>(*memoryPool, readSize);
-      stream->read(buffer->data(), readSize, size - readSize);
+      stream->read(buffer->data(), readSize, fileLength - readSize);
 
-      uint64_t postscriptSize = buffer->data()[readSize - 1] & 0xff;
-      ps = readPostscript(stream.get(), buffer, postscriptSize);
+      postscriptLength = buffer->data()[readSize - 1] & 0xff;
+      ps = readPostscript(stream.get(), buffer, postscriptLength);
       uint64_t footerSize = ps->footerlength();
-      uint64_t tailSize = 1 + postscriptSize + footerSize;
-      footerStart = size - tailSize;
+      uint64_t tailSize = 1 + postscriptLength + footerSize;
       uint64_t footerOffset;
 
       if (tailSize > readSize) {
         buffer->resize(footerSize);
-        stream->read(buffer->data(), footerSize, size - tailSize);
+        stream->read(buffer->data(), footerSize, fileLength - tailSize);
         footerOffset = 0;
       } else {
         footerOffset = readSize - tailSize;
@@ -1758,7 +1984,8 @@ namespace orc {
                                                   options,
                                                   std::move(ps),
                                                   std::move(footer),
-                                                  footerStart));
+                                                  fileLength,
+                                                  postscriptLength));
   }
 
   ColumnStatistics::~ColumnStatistics() {
@@ -1952,7 +2179,7 @@ namespace orc {
       _hasMinimum = false;
       _hasMaximum = false;
       _hasTotalLength = false;
-      
+
       totalLength = 0;
     }else{
       const proto::StringStatistics& stats = pb.stringstatistics();

http://git-wip-us.apache.org/repos/asf/orc/blob/aad3581b/proto/orc_proto.proto
----------------------------------------------------------------------
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index 21b7f7c..502667f 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -222,5 +222,6 @@ message PostScript {
 message FileTail {
   optional PostScript postscript = 1;
   optional Footer footer = 2;
-  optional uint64 footerStart = 3;
+  optional uint64 fileLength = 3;
+  optional uint64 postscriptLength = 4;
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/orc/blob/aad3581b/tools/src/FileMetadata.cc
----------------------------------------------------------------------
diff --git a/tools/src/FileMetadata.cc b/tools/src/FileMetadata.cc
index 13db666..950e955 100644
--- a/tools/src/FileMetadata.cc
+++ b/tools/src/FileMetadata.cc
@@ -16,168 +16,161 @@
  * limitations under the License.
  */
 
+#include <getopt.h>
 #include <iostream>
 #include <fstream>
 #include <vector>
 #include <string>
 #include <sstream>
-#include <iomanip>
 
-#include "wrap/orc-proto-wrapper.hh"
 #include "orc/OrcFile.hh"
 
-using namespace orc::proto;
-
-uint64_t getTotalPaddingSize(const Footer& footer) {
-  uint64_t paddedBytes = 0;
-  StripeInformation stripe;
-  for (int stripeIx=1; stripeIx<footer.stripes_size(); stripeIx++) {
-      stripe = footer.stripes(stripeIx-1);
-      uint64_t prevStripeOffset = stripe.offset();
-      uint64_t prevStripeLen = stripe.datalength() + stripe.indexlength() +
-        stripe.footerlength();
-      paddedBytes += footer.stripes(stripeIx).offset() -
-        (prevStripeOffset + prevStripeLen);
-  };
-  return paddedBytes;
+void printStripeInformation(std::ostream& out,
+                            uint64_t index,
+                            uint64_t columns,
+                            std::unique_ptr<orc::StripeInformation> stripe,
+                            bool verbose) {
+  out << "    { \"stripe\": " << index
+      << ", \"rows\": " << stripe->getNumberOfRows() << ",\n";
+  out << "      \"offset\": " << stripe->getOffset()
+      << ", \"length\": " << stripe->getLength() << ",\n";
+  out << "      \"index\": " << stripe->getIndexLength()
+      << ", \"data\": " << stripe->getDataLength()
+      << ", \"footer\": " << stripe->getFooterLength();
+  if (verbose) {
+    out << ",\n      \"encodings\": [\n";
+    for(uint64_t col=0; col < columns; ++col) {
+      if (col != 0) {
+        out << ",\n";
+      }
+      orc::ColumnEncodingKind encoding = stripe->getColumnEncoding(col);
+      out << "         { \"column\": " << col
+          << ", \"encoding\": \""
+          << columnEncodingKindToString(encoding) << "\"";
+      if (encoding == orc::ColumnEncodingKind_DICTIONARY ||
+          encoding == orc::ColumnEncodingKind_DICTIONARY_V2) {
+        out << ", \"count\": " << stripe->getDictionarySize(col);
+      }
+      out << " }";
+    }
+    out << "\n      ],\n";
+    out << "      \"streams\": [\n";
+    for(uint64_t str = 0; str < stripe->getNumberOfStreams(); ++str) {
+      if (str != 0) {
+        out << ",\n";
+      }
+      ORC_UNIQUE_PTR<orc::StreamInformation> stream =
+        stripe->getStreamInformation(str);
+      out << "        { \"id\": " << str
+          << ", \"column\": " << stream->getColumnId()
+          << ", \"kind\": \"" << streamKindToString(stream->getKind())
+          << "\", \"offset\": " << stream->getOffset()
+          << ", \"length\": " << stream->getLength() << " }";
+    }
+    out << "\n      ]";
+    std::string tz = stripe->getWriterTimezone();
+    if (tz.length() != 0) {
+      out << ",\n      \"timezone\": \"" << tz << "\"";
+    }
+  }
+  out << "\n    }";
 }
 
-void printMetadata(const char*filename) {
-  std::streamsize origPrecision(std::cout.precision());
-  std::ios::fmtflags origFlags(std::cout.flags());
-  std::cout << "Structure for " << filename << std::endl;
-  std::ifstream input;
-
-  input.open(filename, std::ios::in | std::ios::binary);
-  input.seekg(0,input.end);
-  std::streamoff fileSize = input.tellg();
-
-  // Read the postscript size
-  input.seekg(fileSize-1);
-  int result = input.get();
-  if (result == EOF) {
-    throw std::runtime_error("Failed to read postscript size");
+void printMetadata(std::ostream & out, const char*filename, bool verbose) {
+  std::unique_ptr<orc::Reader> reader =
+    orc::createReader(orc::readLocalFile(filename), orc::ReaderOptions());
+  out << "{ \"name\": \"" << filename << "\",\n";
+  uint64_t numberColumns = reader->getType().getMaximumColumnId() + 1;
+  out << "  \"type\": \""
+            << reader->getType().toString() << "\",\n";
+  out << "  \"rows\": " << reader->getNumberOfRows() << ",\n";
+  uint64_t stripeCount = reader->getNumberOfStripes();
+  out << "  \"stripe count\": " << stripeCount << ",\n";
+  out << "  \"format\": \"" << reader->getFormatVersion()
+      << "\", \"writer version\": \""
+            << orc::writerVersionToString(reader->getWriterVersion())
+            << "\",\n";
+  out << "  \"compression\": \""
+            << orc::compressionKindToString(reader->getCompression())
+            << "\",";
+  if (reader->getCompression() != orc::CompressionKind_NONE) {
+    out << " \"compression block\": "
+              << reader->getCompressionSize() << ",";
   }
-  std::streamoff postscriptSize = result;
-
-  // Read the postscript
-  input.seekg(fileSize - postscriptSize-1);
-  std::vector<char> buffer(static_cast<size_t>(postscriptSize));
-  input.read(buffer.data(), postscriptSize);
-  PostScript postscript ;
-  postscript.ParseFromArray(buffer.data(),
-                            static_cast<int>(postscriptSize));
-  std::cout << std::endl << " === Postscript === " << std::endl ;
-  postscript.PrintDebugString();
-
-  // Everything but the postscript is compressed
-  switch (static_cast<int>(postscript.compression())) {
-  case NONE:
-      break;
-  case ZLIB:
-  case SNAPPY:
-  case LZO:
-  default:
-    input.close();
-    throw std::logic_error("ORC files with compression are not supported");
+  out << "\n  \"file length\": " << reader->getFileLength() << ",\n";
+  out << "  \"content\": " << reader->getContentLength()
+      << ", \"stripe stats\": " << reader->getStripeStatisticsLength()
+      << ", \"footer\": " << reader->getFileFooterLength()
+      << ", \"postscript\": " << reader->getFilePostscriptLength() << ",\n";
+  if (reader->getRowIndexStride()) {
+    out << "  \"row index stride\": "
+              << reader->getRowIndexStride() << ",\n";
   }
-
-  std::streamoff footerSize =
-    static_cast<std::streamoff>(postscript.footerlength());
-  std::streamoff metadataSize =
-    static_cast<std::streamoff>(postscript.metadatalength());
-
-  // Read the metadata
-  input.seekg(fileSize - 1 - postscriptSize - footerSize - metadataSize);
-  buffer.resize(static_cast<size_t>(metadataSize));
-  input.read(buffer.data(), metadataSize);
-  Metadata metadata ;
-  metadata.ParseFromArray(buffer.data(), static_cast<int>(metadataSize));
-
-  // Read the footer
-  //input.seekg(fileSize -1 - postscriptSize-footerSize);
-  buffer.resize(static_cast<size_t>(footerSize));
-  input.read(buffer.data(), footerSize);
-  Footer footer ;
-  footer.ParseFromArray(buffer.data(), static_cast<int>(footerSize));
-  std::cout << std::endl << " === Footer === " << std::endl ;
-  footer.PrintDebugString();
-
-  std::cout << std::endl << "=== Stripe Statistics ===" << std::endl;
-
-  StripeInformation stripe ;
-  Stream section;
-  ColumnEncoding encoding;
-  for (int stripeIx=0; stripeIx<footer.stripes_size(); stripeIx++) {
-      std::cout << "Stripe " << stripeIx+1 <<": " << std::endl ;
-      stripe = footer.stripes(stripeIx);
-      stripe.PrintDebugString();
-
-      std::streamoff offset =
-        static_cast<std::streamoff>(stripe.offset() + stripe.indexlength() +
-                                    stripe.datalength());
-      std::streamoff tailLength =
-        static_cast<std::streamoff>(stripe.footerlength());
-
-      // read the stripe footer
-      input.seekg(offset);
-      buffer.resize(static_cast<size_t>(tailLength));
-      input.read(buffer.data(), tailLength);
-
-      StripeFooter stripeFooter;
-      stripeFooter.ParseFromArray(buffer.data(), static_cast<int>(tailLength));
-      //stripeFooter.PrintDebugString();
-      uint64_t stripeStart = stripe.offset();
-      uint64_t sectionStart = stripeStart;
-      for (int streamIx=0; streamIx<stripeFooter.streams_size(); streamIx++) {
-          section = stripeFooter.streams(streamIx);
-          std::cout << "    Stream: column " << section.column()
-                    << " section "
-                    << section.kind() << " start: " << sectionStart
-                    << " length " << section.length() << std::endl;
-          sectionStart += section.length();
-      };
-      for (int columnIx=0; columnIx<stripeFooter.columns_size();
-           columnIx++) {
-          encoding = stripeFooter.columns(columnIx);
-          std::cout << "    Encoding column " << columnIx << ": "
-                    << encoding.kind() ;
-          if (encoding.kind() == ColumnEncoding_Kind_DICTIONARY ||
-              encoding.kind() == ColumnEncoding_Kind_DICTIONARY_V2)
-              std::cout << "[" << encoding.dictionarysize() << "]";
-          std::cout << std::endl;
-      };
-  };
-
-  uint64_t paddedBytes = getTotalPaddingSize(footer);
-  // empty ORC file is ~45 bytes. Assumption here is file length always >0
-  double percentPadding =
-    static_cast<double>(paddedBytes) * 100 / static_cast<double>(fileSize);
-  std::cout << "File length: " << fileSize << " bytes" << std::endl;
-  std::cout <<"Padding length: " << paddedBytes << " bytes" << std::endl;
-  std::cout <<"Padding ratio: " << std::fixed << std::setprecision(2)
-            << percentPadding << " %" << std::endl;
-  std::cout.precision(origPrecision);
-  std::cout.flags(origFlags);
-  input.close();
+  out << "  \"user metadata\": {";
+  std::list<std::string> keys = reader->getMetadataKeys();
+  uint64_t remaining = keys.size();
+  for(std::list<std::string>::const_iterator itr = keys.begin();
+      itr != keys.end(); ++itr) {
+    out << "\n    \"" << *itr << "\": \""
+              << reader->getMetadataValue(*itr) << "\"";
+    if (--remaining != 0) {
+      out << ",";
+    }
+  }
+  out << "\n  },\n";
+  out << "  \"stripes\": [\n";
+  for(uint64_t i=0; i < stripeCount; ++i) {
+    printStripeInformation(out, i, numberColumns, reader->getStripe(i),
+                           verbose);
+    if (i == stripeCount - 1) {
+      out << "\n";
+    } else {
+      out << ",\n";
+    }
+  }
+  out << "  ]\n";
+  out << "}\n";
 }
 
 int main(int argc, char* argv[]) {
-  GOOGLE_PROTOBUF_VERIFY_VERSION;
-
-  if (argc < 2) {
-    std::cout << "Usage: file-metadata <filename>\n";
-  }
-  try {
-    printMetadata(argv[1]);
-  } catch (std::exception& ex) {
-    std::cerr << "Caught exception: " << ex.what() << "\n";
-    return 1;
+  static struct option longOptions[] = {
+    {"help", no_argument, nullptr, 'h'},
+    {"verbose", no_argument, nullptr, 'v'},
+    {nullptr, 0, nullptr, 0}
+  };
+  bool helpFlag = false;
+  bool verboseFlag = false;
+  int opt;
+  do {
+    opt = getopt_long(argc, argv, "hv", longOptions, nullptr);
+    switch (opt) {
+    case '?':
+    case 'h':
+      helpFlag = true;
+      opt = -1;
+      break;
+    case 'v':
+      verboseFlag = true;
+      break;
+    }
+  } while (opt != -1);
+  argc -= optind;
+  argv += optind;
+
+  if (argc < 1 || helpFlag) {
+    std::cerr
+      << "Usage: file-metadata [-h] [--help] [-v] [--verbose] <filename>\n";
+    exit(1);
+  } else {
+    for(int i=0; i < argc; ++i) {
+      try {
+        printMetadata(std::cout, argv[i], verboseFlag);
+      } catch (std::exception& ex) {
+        std::cerr << "Caught exception in " << argv[i]
+                  << ": " << ex.what() << "\n";
+        return 1;
+      }
+    }
   }
-
-  google::protobuf::ShutdownProtobufLibrary();
-
   return 0;
 }
-
-