You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2017/05/17 18:45:23 UTC
parquet-cpp git commit: PARQUET-992: Do not transitively include
zlib.h in public API
Repository: parquet-cpp
Updated Branches:
refs/heads/master 339287aec -> 6787492ec
PARQUET-992: Do not transitively include zlib.h in public API
Use PIMPL pattern to hide zlib from public API
Author: Wes McKinney <we...@twosigma.com>
Closes #333 from wesm/gzip-pimpl and squashes the following commits:
adbca51 [Wes McKinney] Make the appveyor build matrix a little smaller
7d88204 [Wes McKinney] cpplint
39a85bd [Wes McKinney] Use override for virtuals
1064669 [Wes McKinney] Fix up GZIP pimpl
6de81df [Wes McKinney] WIP converting GZipCodec to PIMPL to hide zlib.h
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/6787492e
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/6787492e
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/6787492e
Branch: refs/heads/master
Commit: 6787492eccef3ba6ff891a6dad4f32ba09d268e7
Parents: 339287a
Author: Wes McKinney <we...@twosigma.com>
Authored: Wed May 17 14:45:18 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Wed May 17 14:45:18 2017 -0400
----------------------------------------------------------------------
CMakeLists.txt | 11 +-
appveyor.yml | 17 ++-
ci/msvc-build.bat | 4 +-
cmake_modules/FindBrotli.cmake | 41 +++---
src/parquet/arrow/schema.cc | 7 +-
src/parquet/arrow/schema.h | 7 +-
src/parquet/column/scanner.h | 5 +-
src/parquet/compression.cc | 262 +++++++++++++++++++++---------------
src/parquet/compression.h | 55 +++-----
src/parquet/public-api-test.cc | 6 +
10 files changed, 226 insertions(+), 189 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4c0f3d7..3728a0c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -346,6 +346,10 @@ enable_testing()
# Dependencies
############################################################
+include_directories(
+ ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
if (PARQUET_MINIMAL_DEPENDENCY)
set(IGNORE_OPTIONAL_PACKAGES ON)
message(STATUS "Build using minimal dependencies")
@@ -438,13 +442,6 @@ include(GenerateExportHeader)
add_compiler_export_flags()
############################################################
-# Includes
-
-include_directories(
- ${CMAKE_CURRENT_SOURCE_DIR}/src
-)
-
-############################################################
# "make lint" target
############################################################
if (UNIX)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/appveyor.yml
----------------------------------------------------------------------
diff --git a/appveyor.yml b/appveyor.yml
index 53d198d..ba78a02 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -23,24 +23,27 @@ environment:
- GENERATOR: NMake Makefiles
PYTHON: "3.5"
ARCH: "64"
-
+ CONFIGURATION: "Release"
+ - GENERATOR: Visual Studio 14 2015 Win64
+ PYTHON: "3.5"
+ ARCH: "64"
+ CONFIGURATION: "Debug"
- GENERATOR: Visual Studio 14 2015 Win64
PYTHON: "3.5"
ARCH: "64"
+ CONFIGURATION: "Release"
+ - GENERATOR: Visual Studio 14 2015 Win64
+ PYTHON: "3.5"
+ ARCH: "64"
+ CONFIGURATION: "Toolchain"
MSVC_DEFAULT_OPTIONS: ON
BOOST_ROOT: C:\Libraries\boost_1_63_0
BOOST_LIBRARYDIR: C:\Libraries\boost_1_63_0\lib64-msvc-14.0
-configuration:
- - Debug
- - Release
- - Toolchain
-
init:
- set MINICONDA=C:\Miniconda35-x64
- set PATH=%MINICONDA%;%MINICONDA%/Scripts;%MINICONDA%/Library/bin;%PATH%
- if "%GENERATOR%"=="NMake Makefiles" call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x64
- - if "%CONFIGURATION%"=="Toolchain" conda install -y boost-cpp=1.63 brotli=0.6.0 zlib=1.2.11 snappy=1.1.4 thrift-cpp=0.10.0 -c conda-forge
build_script:
- call ci\msvc-build.bat
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/ci/msvc-build.bat
----------------------------------------------------------------------
diff --git a/ci/msvc-build.bat b/ci/msvc-build.bat
index 6789b46..04743e6 100644
--- a/ci/msvc-build.bat
+++ b/ci/msvc-build.bat
@@ -23,6 +23,8 @@ cd build
SET PARQUET_TEST_DATA=%APPVEYOR_BUILD_FOLDER%\data
if "%CONFIGURATION%" == "Toolchain" (
+ conda install -y boost-cpp=1.63 brotli=0.6.0 zlib=1.2.11 snappy=1.1.4 thrift-cpp=0.10.0 -c conda-forge
+
set PARQUET_BUILD_TOOLCHAIN=%MINICONDA%/Library
cmake -G "%GENERATOR%" ^
@@ -48,4 +50,4 @@ if NOT "%CONFIGURATION%" == "Toolchain" (
if "%CONFIGURATION%" == "Release" (
ctest -VV || exit /B
)
-)
\ No newline at end of file
+)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/cmake_modules/FindBrotli.cmake
----------------------------------------------------------------------
diff --git a/cmake_modules/FindBrotli.cmake b/cmake_modules/FindBrotli.cmake
index 669c3f3..0a991b4 100644
--- a/cmake_modules/FindBrotli.cmake
+++ b/cmake_modules/FindBrotli.cmake
@@ -38,24 +38,25 @@ elseif ( Brotli_HOME )
list( APPEND _brotli_roots ${Brotli_HOME} )
endif()
-# Try the parameterized roots, if they exist
-if ( _brotli_roots )
- find_path( BROTLI_INCLUDE_DIR NAMES brotli/decode.h
- PATHS ${_brotli_roots} NO_DEFAULT_PATH
- PATH_SUFFIXES "include" )
- find_library( BROTLI_LIBRARY_ENC NAMES libbrotlienc.a brotlienc
- PATHS ${_brotli_roots} NO_DEFAULT_PATH
- PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
- find_library( BROTLI_LIBRARY_DEC NAMES libbrotlidec.a brotlidec
- PATHS ${_brotli_roots} NO_DEFAULT_PATH
- PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
- find_library( BROTLI_LIBRARY_COMMON NAMES libbrotlicommon.a brotlicommon
- PATHS ${_brotli_roots} NO_DEFAULT_PATH
- PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
-else ()
- find_path( BROTLI_INCLUDE_DIR NAMES brotli.h )
- find_library( BROTLI_LIBRARIES NAMES brotlienc )
-endif ()
+find_path( BROTLI_INCLUDE_DIR NAMES brotli/decode.h
+ PATHS ${_brotli_roots}
+ NO_DEFAULT_PATH
+ PATH_SUFFIXES "include" )
+
+find_library( BROTLI_LIBRARY_ENC NAMES libbrotlienc.a brotlienc
+ PATHS ${_brotli_roots}
+ NO_DEFAULT_PATH
+ PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
+
+find_library( BROTLI_LIBRARY_DEC NAMES libbrotlidec.a brotlidec
+ PATHS ${_brotli_roots}
+ NO_DEFAULT_PATH
+ PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
+
+find_library( BROTLI_LIBRARY_COMMON NAMES libbrotlicommon.a brotlicommon
+ PATHS ${_brotli_roots}
+ NO_DEFAULT_PATH
+ PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
set(BROTLI_LIBRARIES ${BROTLI_LIBRARY_ENC} ${BROTLI_LIBRARY_DEC}
${BROTLI_LIBRARY_COMMON})
@@ -63,8 +64,6 @@ set(BROTLI_LIBRARIES ${BROTLI_LIBRARY_ENC} ${BROTLI_LIBRARY_DEC}
if (BROTLI_INCLUDE_DIR AND (PARQUET_MINIMAL_DEPENDENCY OR BROTLI_LIBRARIES))
set(BROTLI_FOUND TRUE)
get_filename_component( BROTLI_LIBS ${BROTLI_LIBRARY_ENC} PATH )
- set(BROTLI_HEADER_NAME brotli.h)
- set(BROTLI_HEADER ${BROTLI_INCLUDE_DIR}/${BROTLI_HEADER_NAME})
set(BROTLI_LIB_NAME brotli)
if (MSVC AND NOT BROTLI_MSVC_STATIC_LIB_SUFFIX)
set(BROTLI_MSVC_STATIC_LIB_SUFFIX _static)
@@ -84,7 +83,7 @@ endif ()
if (BROTLI_FOUND)
if (NOT Brotli_FIND_QUIETLY)
if (PARQUET_MINIMAL_DEPENDENCY)
- message(STATUS "Found the Brotli header: ${BROTLI_HEADER}")
+ message(STATUS "Found the Brotli headers: ${BROTLI_INCLUDE_DIR}")
else ()
message(STATUS "Found the Brotli library: ${BROTLI_LIBRARIES}")
endif ()
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/arrow/schema.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/schema.cc b/src/parquet/arrow/schema.cc
index 6aeff17..83968bc 100644
--- a/src/parquet/arrow/schema.cc
+++ b/src/parquet/arrow/schema.cc
@@ -371,13 +371,12 @@ Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
}
Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
- const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Schema>* out) {
+ const std::vector<int>& column_indices, std::shared_ptr<::arrow::Schema>* out) {
return FromParquetSchema(parquet_schema, column_indices, nullptr, out);
}
-Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
- std::shared_ptr<::arrow::Schema>* out) {
+Status FromParquetSchema(
+ const SchemaDescriptor* parquet_schema, std::shared_ptr<::arrow::Schema>* out) {
return FromParquetSchema(parquet_schema, nullptr, out);
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/arrow/schema.h
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/schema.h b/src/parquet/arrow/schema.h
index 30dee20..69ca661 100644
--- a/src/parquet/arrow/schema.h
+++ b/src/parquet/arrow/schema.h
@@ -59,12 +59,11 @@ namespace arrow {
// Without metadata
::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema,
- const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Schema>* out);
+ const std::vector<int>& column_indices, std::shared_ptr<::arrow::Schema>* out);
// Without metadata or indices
-::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema,
- std::shared_ptr<::arrow::Schema>* out);
+::arrow::Status PARQUET_EXPORT FromParquetSchema(
+ const SchemaDescriptor* parquet_schema, std::shared_ptr<::arrow::Schema>* out);
::arrow::Status PARQUET_EXPORT FieldToNode(const std::shared_ptr<::arrow::Field>& field,
const WriterProperties& properties, schema::NodePtr* out);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/column/scanner.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/scanner.h b/src/parquet/column/scanner.h
index 75b08b6..914f2ad 100644
--- a/src/parquet/column/scanner.h
+++ b/src/parquet/column/scanner.h
@@ -103,9 +103,8 @@ class PARQUET_EXPORT TypedScanner : public Scanner {
bool NextLevels(int16_t* def_level, int16_t* rep_level) {
if (level_offset_ == levels_buffered_) {
- levels_buffered_ = typed_reader_->ReadBatch(
- batch_size_, def_levels_.data(), rep_levels_.data(),
- values_, &values_buffered_);
+ levels_buffered_ = typed_reader_->ReadBatch(batch_size_, def_levels_.data(),
+ rep_levels_.data(), values_, &values_buffered_);
value_offset_ = 0;
level_offset_ = 0;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/compression.cc
----------------------------------------------------------------------
diff --git a/src/parquet/compression.cc b/src/parquet/compression.cc
index c71f161..7d219fe 100644
--- a/src/parquet/compression.cc
+++ b/src/parquet/compression.cc
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+#include "parquet/compression.h"
+
#include <cstdint>
#include <memory>
#include <string>
@@ -22,13 +24,15 @@
#include <brotli/decode.h>
#include <brotli/encode.h>
#include <snappy.h>
+#include <zlib.h>
-#include "parquet/compression.h"
#include "parquet/exception.h"
#include "parquet/types.h"
namespace parquet {
+Codec::~Codec() {}
+
std::unique_ptr<Codec> Codec::Create(Compression::type codec_type) {
std::unique_ptr<Codec> result;
switch (codec_type) {
@@ -68,140 +72,188 @@ static constexpr int GZIP_CODEC = 16;
// Determine if this is libz or gzip from header.
static constexpr int DETECT_CODEC = 32;
-GZipCodec::GZipCodec(Format format)
- : format_(format), compressor_initialized_(false), decompressor_initialized_(false) {}
-
-GZipCodec::~GZipCodec() {
- EndCompressor();
- EndDecompressor();
-}
+class GZipCodec::GZipCodecImpl {
+ public:
+ explicit GZipCodecImpl(GZipCodec::Format format)
+ : format_(format),
+ compressor_initialized_(false),
+ decompressor_initialized_(false) {}
-void GZipCodec::InitCompressor() {
- EndDecompressor();
- memset(&stream_, 0, sizeof(stream_));
-
- int ret;
- // Initialize to run specified format
- int window_bits = WINDOW_BITS;
- if (format_ == DEFLATE) {
- window_bits = -window_bits;
- } else if (format_ == GZIP) {
- window_bits += GZIP_CODEC;
+ ~GZipCodecImpl() {
+ EndCompressor();
+ EndDecompressor();
}
- if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, 9,
- Z_DEFAULT_STRATEGY)) != Z_OK) {
- throw ParquetException("zlib deflateInit failed: " + std::string(stream_.msg));
+
+ void InitCompressor() {
+ EndDecompressor();
+ memset(&stream_, 0, sizeof(stream_));
+
+ int ret;
+ // Initialize to run specified format
+ int window_bits = WINDOW_BITS;
+ if (format_ == DEFLATE) {
+ window_bits = -window_bits;
+ } else if (format_ == GZIP) {
+ window_bits += GZIP_CODEC;
+ }
+ if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, 9,
+ Z_DEFAULT_STRATEGY)) != Z_OK) {
+ throw ParquetException("zlib deflateInit failed: " + std::string(stream_.msg));
+ }
+
+ compressor_initialized_ = true;
}
- compressor_initialized_ = true;
-}
+ void EndCompressor() {
+ if (compressor_initialized_) { (void)deflateEnd(&stream_); }
+ compressor_initialized_ = false;
+ }
-void GZipCodec::EndCompressor() {
- if (compressor_initialized_) { (void)deflateEnd(&stream_); }
- compressor_initialized_ = false;
-}
+ void InitDecompressor() {
+ EndCompressor();
+ memset(&stream_, 0, sizeof(stream_));
+ int ret;
-void GZipCodec::InitDecompressor() {
- EndCompressor();
- memset(&stream_, 0, sizeof(stream_));
- int ret;
+ // Initialize to run either deflate or zlib/gzip format
+ int window_bits = format_ == DEFLATE ? -WINDOW_BITS : WINDOW_BITS | DETECT_CODEC;
+ if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) {
+ throw ParquetException("zlib inflateInit failed: " + std::string(stream_.msg));
+ }
+ decompressor_initialized_ = true;
+ }
- // Initialize to run either deflate or zlib/gzip format
- int window_bits = format_ == DEFLATE ? -WINDOW_BITS : WINDOW_BITS | DETECT_CODEC;
- if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) {
- throw ParquetException("zlib inflateInit failed: " + std::string(stream_.msg));
+ void EndDecompressor() {
+ if (decompressor_initialized_) { (void)inflateEnd(&stream_); }
+ decompressor_initialized_ = false;
}
- decompressor_initialized_ = true;
-}
-void GZipCodec::EndDecompressor() {
- if (decompressor_initialized_) { (void)inflateEnd(&stream_); }
- decompressor_initialized_ = false;
-}
+ void Decompress(int64_t input_length, const uint8_t* input, int64_t output_length,
+ uint8_t* output) {
+ if (!decompressor_initialized_) { InitDecompressor(); }
+ if (output_length == 0) {
+ // The zlib library does not allow *output to be NULL, even when output_length
+ // is 0 (inflate() will return Z_STREAM_ERROR). We don't consider this an
+ // error, so bail early if no output is expected. Note that we don't signal
+ // an error if the input actually contains compressed data.
+ return;
+ }
-void GZipCodec::Decompress(
- int64_t input_length, const uint8_t* input, int64_t output_length, uint8_t* output) {
- if (!decompressor_initialized_) { InitDecompressor(); }
- if (output_length == 0) {
- // The zlib library does not allow *output to be NULL, even when output_length
- // is 0 (inflate() will return Z_STREAM_ERROR). We don't consider this an
- // error, so bail early if no output is expected. Note that we don't signal
- // an error if the input actually contains compressed data.
- return;
+ // Reset the stream for this block
+ if (inflateReset(&stream_) != Z_OK) {
+ throw ParquetException("zlib inflateReset failed: " + std::string(stream_.msg));
+ }
+
+ int ret = 0;
+ // gzip can run in streaming mode or non-streaming mode. We only
+ // support the non-streaming use case where we present it the entire
+ // compressed input and a buffer big enough to contain the entire
+ // compressed output. In the case where we don't know the output,
+ // we just make a bigger buffer and try the non-streaming mode
+ // from the beginning again.
+ while (ret != Z_STREAM_END) {
+ stream_.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input));
+ stream_.avail_in = input_length;
+ stream_.next_out = reinterpret_cast<Bytef*>(output);
+ stream_.avail_out = output_length;
+
+ // We know the output size. In this case, we can use Z_FINISH
+ // which is more efficient.
+ ret = inflate(&stream_, Z_FINISH);
+ if (ret == Z_STREAM_END || ret != Z_OK) break;
+
+ // Failure, buffer was too small
+ std::stringstream ss;
+ ss << "Too small a buffer passed to GZipCodec. InputLength=" << input_length
+ << " OutputLength=" << output_length;
+ throw ParquetException(ss.str());
+ }
+
+ // Failure for some other reason
+ if (ret != Z_STREAM_END) {
+ std::stringstream ss;
+ ss << "GZipCodec failed: ";
+ if (stream_.msg != NULL) ss << stream_.msg;
+ throw ParquetException(ss.str());
+ }
}
- // Reset the stream for this block
- if (inflateReset(&stream_) != Z_OK) {
- throw ParquetException("zlib inflateReset failed: " + std::string(stream_.msg));
+ int64_t MaxCompressedLen(int64_t input_length, const uint8_t* input) {
+ // Must be in compression mode
+ if (!compressor_initialized_) { InitCompressor(); }
+ // TODO(wesm): deal with zlib < 1.2.3 (see Impala codebase)
+ return deflateBound(&stream_, static_cast<uLong>(input_length));
}
- int ret = 0;
- // gzip can run in streaming mode or non-streaming mode. We only
- // support the non-streaming use case where we present it the entire
- // compressed input and a buffer big enough to contain the entire
- // compressed output. In the case where we don't know the output,
- // we just make a bigger buffer and try the non-streaming mode
- // from the beginning again.
- while (ret != Z_STREAM_END) {
+ int64_t Compress(int64_t input_length, const uint8_t* input, int64_t output_length,
+ uint8_t* output) {
+ if (!compressor_initialized_) { InitCompressor(); }
stream_.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input));
stream_.avail_in = input_length;
stream_.next_out = reinterpret_cast<Bytef*>(output);
stream_.avail_out = output_length;
- // We know the output size. In this case, we can use Z_FINISH
- // which is more efficient.
- ret = inflate(&stream_, Z_FINISH);
- if (ret == Z_STREAM_END || ret != Z_OK) break;
+ int64_t ret = 0;
+ if ((ret = deflate(&stream_, Z_FINISH)) != Z_STREAM_END) {
+ if (ret == Z_OK) {
+ // will return Z_OK (and stream.msg NOT set) if stream.avail_out is too
+ // small
+ throw ParquetException("zlib deflate failed, output buffer too small");
+ }
+ std::stringstream ss;
+ ss << "zlib deflate failed: " << stream_.msg;
+ throw ParquetException(ss.str());
+ }
- // Failure, buffer was too small
- std::stringstream ss;
- ss << "Too small a buffer passed to GZipCodec. InputLength=" << input_length
- << " OutputLength=" << output_length;
- throw ParquetException(ss.str());
- }
+ if (deflateReset(&stream_) != Z_OK) {
+ throw ParquetException("zlib deflateReset failed: " + std::string(stream_.msg));
+ }
- // Failure for some other reason
- if (ret != Z_STREAM_END) {
- std::stringstream ss;
- ss << "GZipCodec failed: ";
- if (stream_.msg != NULL) ss << stream_.msg;
- throw ParquetException(ss.str());
+ // Actual output length
+ return output_length - stream_.avail_out;
}
+
+ private:
+ // zlib is stateful and the z_stream state variable must be initialized
+ // before
+ z_stream stream_;
+
+ // Realistically, this will always be GZIP, but we leave the option open to
+ // configure
+ GZipCodec::Format format_;
+
+ // These variables are mutually exclusive. When the codec is in "compressor"
+ // state, compressor_initialized_ is true while decompressor_initialized_ is
+ // false. When it's decompressing, the opposite is true.
+ //
+ // Indeed, this is slightly hacky, but the alternative is having separate
+ // Compressor and Decompressor classes. If this ever becomes an issue, we can
+ // perform the refactoring then
+ bool compressor_initialized_;
+ bool decompressor_initialized_;
+};
+
+GZipCodec::GZipCodec(Format format) {
+ impl_.reset(new GZipCodecImpl(format));
+}
+
+GZipCodec::~GZipCodec() {}
+
+void GZipCodec::Decompress(
+ int64_t input_length, const uint8_t* input, int64_t output_length, uint8_t* output) {
+ return impl_->Decompress(input_length, input, output_length, output);
}
int64_t GZipCodec::MaxCompressedLen(int64_t input_length, const uint8_t* input) {
- // Must be in compression mode
- if (!compressor_initialized_) { InitCompressor(); }
- // TODO(wesm): deal with zlib < 1.2.3 (see Impala codebase)
- return deflateBound(&stream_, static_cast<uLong>(input_length));
+ return impl_->MaxCompressedLen(input_length, input);
}
int64_t GZipCodec::Compress(
int64_t input_length, const uint8_t* input, int64_t output_length, uint8_t* output) {
- if (!compressor_initialized_) { InitCompressor(); }
- stream_.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input));
- stream_.avail_in = input_length;
- stream_.next_out = reinterpret_cast<Bytef*>(output);
- stream_.avail_out = output_length;
-
- int64_t ret = 0;
- if ((ret = deflate(&stream_, Z_FINISH)) != Z_STREAM_END) {
- if (ret == Z_OK) {
- // will return Z_OK (and stream.msg NOT set) if stream.avail_out is too
- // small
- throw ParquetException("zlib deflate failed, output buffer too small");
- }
- std::stringstream ss;
- ss << "zlib deflate failed: " << stream_.msg;
- throw ParquetException(ss.str());
- }
-
- if (deflateReset(&stream_) != Z_OK) {
- throw ParquetException("zlib deflateReset failed: " + std::string(stream_.msg));
- }
+ return impl_->Compress(input_length, input, output_length, output);
+}
- // Actual output length
- return output_length - stream_.avail_out;
+const char* GZipCodec::name() const {
+ return "gzip";
}
// ----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/compression.h
----------------------------------------------------------------------
diff --git a/src/parquet/compression.h b/src/parquet/compression.h
index f006e54..f0a38b6 100644
--- a/src/parquet/compression.h
+++ b/src/parquet/compression.h
@@ -18,19 +18,18 @@
#ifndef PARQUET_COMPRESSION_CODEC_H
#define PARQUET_COMPRESSION_CODEC_H
-#include <zlib.h>
-
#include <cstdint>
#include <memory>
#include "parquet/exception.h"
#include "parquet/types.h"
+#include "parquet/util/visibility.h"
namespace parquet {
-class Codec {
+class PARQUET_EXPORT Codec {
public:
- virtual ~Codec() {}
+ virtual ~Codec();
static std::unique_ptr<Codec> Create(Compression::type codec);
@@ -48,15 +47,15 @@ class Codec {
// Snappy codec.
class PARQUET_EXPORT SnappyCodec : public Codec {
public:
- virtual void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len,
- uint8_t* output_buffer);
+ void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len,
+ uint8_t* output_buffer) override;
- virtual int64_t Compress(int64_t input_len, const uint8_t* input,
- int64_t output_buffer_len, uint8_t* output_buffer);
+ int64_t Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override;
- virtual int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input);
+ int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override;
- virtual const char* name() const { return "snappy"; }
+ const char* name() const { return "snappy"; }
};
// Brotli codec.
@@ -86,38 +85,20 @@ class PARQUET_EXPORT GZipCodec : public Codec {
explicit GZipCodec(Format format = GZIP);
virtual ~GZipCodec();
- virtual void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len,
- uint8_t* output_buffer);
+ void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len,
+ uint8_t* output_buffer) override;
- virtual int64_t Compress(int64_t input_len, const uint8_t* input,
- int64_t output_buffer_len, uint8_t* output_buffer);
+ int64_t Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override;
- virtual int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input);
+ int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override;
- virtual const char* name() const { return "gzip"; }
+ const char* name() const override;
private:
- // zlib is stateful and the z_stream state variable must be initialized
- // before
- z_stream stream_;
-
- // Realistically, this will always be GZIP, but we leave the option open to
- // configure
- Format format_;
-
- // These variables are mutually exclusive. When the codec is in "compressor"
- // state, compressor_initialized_ is true while decompressor_initialized_ is
- // false. When it's decompressing, the opposite is true.
- //
- // Indeed, this is slightly hacky, but the alternative is having separate
- // Compressor and Decompressor classes. If this ever becomes an issue, we can
- // perform the refactoring then
- void InitCompressor();
- void InitDecompressor();
- void EndCompressor();
- void EndDecompressor();
- bool compressor_initialized_;
- bool decompressor_initialized_;
+ // The gzip compressor is stateful
+ class GZipCodecImpl;
+ std::unique_ptr<GZipCodecImpl> impl_;
};
} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/public-api-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/public-api-test.cc b/src/parquet/public-api-test.cc
index a5c88da..4d6f675 100644
--- a/src/parquet/public-api-test.cc
+++ b/src/parquet/public-api-test.cc
@@ -34,6 +34,12 @@ TEST(TestPublicAPI, DoesNotExportDCHECK) {
#endif
}
+TEST(TestPublicAPI, DoesNotIncludeZlib) {
+#ifdef ZLIB_H
+ FAIL() << "zlib.h should not be transitively included";
+#endif
+}
+
void ThrowsParquetException() {
throw parquet::ParquetException("This function throws");
}