You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2017/05/17 18:45:23 UTC
parquet-cpp git commit: PARQUET-992: Do not transitively include
zlib.h in public API
Repository: parquet-cpp
Updated Branches:
refs/heads/master 339287aec -> 6787492ec
PARQUET-992: Do not transitively include zlib.h in public API
Use PIMPL pattern to hide zlib from public API
Author: Wes McKinney <we...@twosigma.com>
Closes #333 from wesm/gzip-pimpl and squashes the following commits:
adbca51 [Wes McKinney] Make the appveyor build matrix a little smaller
7d88204 [Wes McKinney] cpplint
39a85bd [Wes McKinney] Use override for virtuals
1064669 [Wes McKinney] Fix up GZIP pimpl
6de81df [Wes McKinney] WIP converting GZipCodec to PIMPL to hide zlib.h
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/6787492e
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/6787492e
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/6787492e
Branch: refs/heads/master
Commit: 6787492eccef3ba6ff891a6dad4f32ba09d268e7
Parents: 339287a
Author: Wes McKinney <we...@twosigma.com>
Authored: Wed May 17 14:45:18 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Wed May 17 14:45:18 2017 -0400
----------------------------------------------------------------------
CMakeLists.txt | 11 +-
appveyor.yml | 17 ++-
ci/msvc-build.bat | 4 +-
cmake_modules/FindBrotli.cmake | 41 +++---
src/parquet/arrow/schema.cc | 7 +-
src/parquet/arrow/schema.h | 7 +-
src/parquet/column/scanner.h | 5 +-
src/parquet/compression.cc | 262 +++++++++++++++++++++---------------
src/parquet/compression.h | 55 +++-----
src/parquet/public-api-test.cc | 6 +
10 files changed, 226 insertions(+), 189 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4c0f3d7..3728a0c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -346,6 +346,10 @@ enable_testing()
# Dependencies
############################################################
+include_directories(
+ ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
if (PARQUET_MINIMAL_DEPENDENCY)
set(IGNORE_OPTIONAL_PACKAGES ON)
message(STATUS "Build using minimal dependencies")
@@ -438,13 +442,6 @@ include(GenerateExportHeader)
add_compiler_export_flags()
############################################################
-# Includes
-
-include_directories(
- ${CMAKE_CURRENT_SOURCE_DIR}/src
-)
-
-############################################################
# "make lint" target
############################################################
if (UNIX)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/appveyor.yml
----------------------------------------------------------------------
diff --git a/appveyor.yml b/appveyor.yml
index 53d198d..ba78a02 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -23,24 +23,27 @@ environment:
- GENERATOR: NMake Makefiles
PYTHON: "3.5"
ARCH: "64"
-
+ CONFIGURATION: "Release"
+ - GENERATOR: Visual Studio 14 2015 Win64
+ PYTHON: "3.5"
+ ARCH: "64"
+ CONFIGURATION: "Debug"
- GENERATOR: Visual Studio 14 2015 Win64
PYTHON: "3.5"
ARCH: "64"
+ CONFIGURATION: "Release"
+ - GENERATOR: Visual Studio 14 2015 Win64
+ PYTHON: "3.5"
+ ARCH: "64"
+ CONFIGURATION: "Toolchain"
MSVC_DEFAULT_OPTIONS: ON
BOOST_ROOT: C:\Libraries\boost_1_63_0
BOOST_LIBRARYDIR: C:\Libraries\boost_1_63_0\lib64-msvc-14.0
-configuration:
- - Debug
- - Release
- - Toolchain
-
init:
- set MINICONDA=C:\Miniconda35-x64
- set PATH=%MINICONDA%;%MINICONDA%/Scripts;%MINICONDA%/Library/bin;%PATH%
- if "%GENERATOR%"=="NMake Makefiles" call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x64
- - if "%CONFIGURATION%"=="Toolchain" conda install -y boost-cpp=1.63 brotli=0.6.0 zlib=1.2.11 snappy=1.1.4 thrift-cpp=0.10.0 -c conda-forge
build_script:
- call ci\msvc-build.bat
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/ci/msvc-build.bat
----------------------------------------------------------------------
diff --git a/ci/msvc-build.bat b/ci/msvc-build.bat
index 6789b46..04743e6 100644
--- a/ci/msvc-build.bat
+++ b/ci/msvc-build.bat
@@ -23,6 +23,8 @@ cd build
SET PARQUET_TEST_DATA=%APPVEYOR_BUILD_FOLDER%\data
if "%CONFIGURATION%" == "Toolchain" (
+ conda install -y boost-cpp=1.63 brotli=0.6.0 zlib=1.2.11 snappy=1.1.4 thrift-cpp=0.10.0 -c conda-forge
+
set PARQUET_BUILD_TOOLCHAIN=%MINICONDA%/Library
cmake -G "%GENERATOR%" ^
@@ -48,4 +50,4 @@ if NOT "%CONFIGURATION%" == "Toolchain" (
if "%CONFIGURATION%" == "Release" (
ctest -VV || exit /B
)
-)
\ No newline at end of file
+)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/cmake_modules/FindBrotli.cmake
----------------------------------------------------------------------
diff --git a/cmake_modules/FindBrotli.cmake b/cmake_modules/FindBrotli.cmake
index 669c3f3..0a991b4 100644
--- a/cmake_modules/FindBrotli.cmake
+++ b/cmake_modules/FindBrotli.cmake
@@ -38,24 +38,25 @@ elseif ( Brotli_HOME )
list( APPEND _brotli_roots ${Brotli_HOME} )
endif()
-# Try the parameterized roots, if they exist
-if ( _brotli_roots )
- find_path( BROTLI_INCLUDE_DIR NAMES brotli/decode.h
- PATHS ${_brotli_roots} NO_DEFAULT_PATH
- PATH_SUFFIXES "include" )
- find_library( BROTLI_LIBRARY_ENC NAMES libbrotlienc.a brotlienc
- PATHS ${_brotli_roots} NO_DEFAULT_PATH
- PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
- find_library( BROTLI_LIBRARY_DEC NAMES libbrotlidec.a brotlidec
- PATHS ${_brotli_roots} NO_DEFAULT_PATH
- PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
- find_library( BROTLI_LIBRARY_COMMON NAMES libbrotlicommon.a brotlicommon
- PATHS ${_brotli_roots} NO_DEFAULT_PATH
- PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
-else ()
- find_path( BROTLI_INCLUDE_DIR NAMES brotli.h )
- find_library( BROTLI_LIBRARIES NAMES brotlienc )
-endif ()
+find_path( BROTLI_INCLUDE_DIR NAMES brotli/decode.h
+ PATHS ${_brotli_roots}
+ NO_DEFAULT_PATH
+ PATH_SUFFIXES "include" )
+
+find_library( BROTLI_LIBRARY_ENC NAMES libbrotlienc.a brotlienc
+ PATHS ${_brotli_roots}
+ NO_DEFAULT_PATH
+ PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
+
+find_library( BROTLI_LIBRARY_DEC NAMES libbrotlidec.a brotlidec
+ PATHS ${_brotli_roots}
+ NO_DEFAULT_PATH
+ PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
+
+find_library( BROTLI_LIBRARY_COMMON NAMES libbrotlicommon.a brotlicommon
+ PATHS ${_brotli_roots}
+ NO_DEFAULT_PATH
+ PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
set(BROTLI_LIBRARIES ${BROTLI_LIBRARY_ENC} ${BROTLI_LIBRARY_DEC}
${BROTLI_LIBRARY_COMMON})
@@ -63,8 +64,6 @@ set(BROTLI_LIBRARIES ${BROTLI_LIBRARY_ENC} ${BROTLI_LIBRARY_DEC}
if (BROTLI_INCLUDE_DIR AND (PARQUET_MINIMAL_DEPENDENCY OR BROTLI_LIBRARIES))
set(BROTLI_FOUND TRUE)
get_filename_component( BROTLI_LIBS ${BROTLI_LIBRARY_ENC} PATH )
- set(BROTLI_HEADER_NAME brotli.h)
- set(BROTLI_HEADER ${BROTLI_INCLUDE_DIR}/${BROTLI_HEADER_NAME})
set(BROTLI_LIB_NAME brotli)
if (MSVC AND NOT BROTLI_MSVC_STATIC_LIB_SUFFIX)
set(BROTLI_MSVC_STATIC_LIB_SUFFIX _static)
@@ -84,7 +83,7 @@ endif ()
if (BROTLI_FOUND)
if (NOT Brotli_FIND_QUIETLY)
if (PARQUET_MINIMAL_DEPENDENCY)
- message(STATUS "Found the Brotli header: ${BROTLI_HEADER}")
+ message(STATUS "Found the Brotli headers: ${BROTLI_INCLUDE_DIR}")
else ()
message(STATUS "Found the Brotli library: ${BROTLI_LIBRARIES}")
endif ()
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/arrow/schema.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/schema.cc b/src/parquet/arrow/schema.cc
index 6aeff17..83968bc 100644
--- a/src/parquet/arrow/schema.cc
+++ b/src/parquet/arrow/schema.cc
@@ -371,13 +371,12 @@ Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
}
Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
- const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Schema>* out) {
+ const std::vector<int>& column_indices, std::shared_ptr<::arrow::Schema>* out) {
return FromParquetSchema(parquet_schema, column_indices, nullptr, out);
}
-Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
- std::shared_ptr<::arrow::Schema>* out) {
+Status FromParquetSchema(
+ const SchemaDescriptor* parquet_schema, std::shared_ptr<::arrow::Schema>* out) {
return FromParquetSchema(parquet_schema, nullptr, out);
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/arrow/schema.h
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/schema.h b/src/parquet/arrow/schema.h
index 30dee20..69ca661 100644
--- a/src/parquet/arrow/schema.h
+++ b/src/parquet/arrow/schema.h
@@ -59,12 +59,11 @@ namespace arrow {
// Without metadata
::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema,
- const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Schema>* out);
+ const std::vector<int>& column_indices, std::shared_ptr<::arrow::Schema>* out);
// Without metadata or indices
-::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema,
- std::shared_ptr<::arrow::Schema>* out);
+::arrow::Status PARQUET_EXPORT FromParquetSchema(
+ const SchemaDescriptor* parquet_schema, std::shared_ptr<::arrow::Schema>* out);
::arrow::Status PARQUET_EXPORT FieldToNode(const std::shared_ptr<::arrow::Field>& field,
const WriterProperties& properties, schema::NodePtr* out);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/column/scanner.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/scanner.h b/src/parquet/column/scanner.h
index 75b08b6..914f2ad 100644
--- a/src/parquet/column/scanner.h
+++ b/src/parquet/column/scanner.h
@@ -103,9 +103,8 @@ class PARQUET_EXPORT TypedScanner : public Scanner {
bool NextLevels(int16_t* def_level, int16_t* rep_level) {
if (level_offset_ == levels_buffered_) {
- levels_buffered_ = typed_reader_->ReadBatch(
- batch_size_, def_levels_.data(), rep_levels_.data(),
- values_, &values_buffered_);
+ levels_buffered_ = typed_reader_->ReadBatch(batch_size_, def_levels_.data(),
+ rep_levels_.data(), values_, &values_buffered_);
value_offset_ = 0;
level_offset_ = 0;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/compression.cc
----------------------------------------------------------------------
diff --git a/src/parquet/compression.cc b/src/parquet/compression.cc
index c71f161..7d219fe 100644
--- a/src/parquet/compression.cc
+++ b/src/parquet/compression.cc
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+#include "parquet/compression.h"
+
#include <cstdint>
#include <memory>
#include <string>
@@ -22,13 +24,15 @@
#include <brotli/decode.h>
#include <brotli/encode.h>
#include <snappy.h>
+#include <zlib.h>
-#include "parquet/compression.h"
#include "parquet/exception.h"
#include "parquet/types.h"
namespace parquet {
+Codec::~Codec() {}
+
std::unique_ptr<Codec> Codec::Create(Compression::type codec_type) {
std::unique_ptr<Codec> result;
switch (codec_type) {
@@ -68,140 +72,188 @@ static constexpr int GZIP_CODEC = 16;
// Determine if this is libz or gzip from header.
static constexpr int DETECT_CODEC = 32;
-GZipCodec::GZipCodec(Format format)
- : format_(format), compressor_initialized_(false), decompressor_initialized_(false) {}
-
-GZipCodec::~GZipCodec() {
- EndCompressor();
- EndDecompressor();
-}
+class GZipCodec::GZipCodecImpl {
+ public:
+ explicit GZipCodecImpl(GZipCodec::Format format)
+ : format_(format),
+ compressor_initialized_(false),
+ decompressor_initialized_(false) {}
-void GZipCodec::InitCompressor() {
- EndDecompressor();
- memset(&stream_, 0, sizeof(stream_));
-
- int ret;
- // Initialize to run specified format
- int window_bits = WINDOW_BITS;
- if (format_ == DEFLATE) {
- window_bits = -window_bits;
- } else if (format_ == GZIP) {
- window_bits += GZIP_CODEC;
+ ~GZipCodecImpl() {
+ EndCompressor();
+ EndDecompressor();
}
- if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, 9,
- Z_DEFAULT_STRATEGY)) != Z_OK) {
- throw ParquetException("zlib deflateInit failed: " + std::string(stream_.msg));
+
+ void InitCompressor() {
+ EndDecompressor();
+ memset(&stream_, 0, sizeof(stream_));
+
+ int ret;
+ // Initialize to run specified format
+ int window_bits = WINDOW_BITS;
+ if (format_ == DEFLATE) {
+ window_bits = -window_bits;
+ } else if (format_ == GZIP) {
+ window_bits += GZIP_CODEC;
+ }
+ if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, 9,
+ Z_DEFAULT_STRATEGY)) != Z_OK) {
+ throw ParquetException("zlib deflateInit failed: " + std::string(stream_.msg));
+ }
+
+ compressor_initialized_ = true;
}
- compressor_initialized_ = true;
-}
+ void EndCompressor() {
+ if (compressor_initialized_) { (void)deflateEnd(&stream_); }
+ compressor_initialized_ = false;
+ }
-void GZipCodec::EndCompressor() {
- if (compressor_initialized_) { (void)deflateEnd(&stream_); }
- compressor_initialized_ = false;
-}
+ void InitDecompressor() {
+ EndCompressor();
+ memset(&stream_, 0, sizeof(stream_));
+ int ret;
-void GZipCodec::InitDecompressor() {
- EndCompressor();
- memset(&stream_, 0, sizeof(stream_));
- int ret;
+ // Initialize to run either deflate or zlib/gzip format
+ int window_bits = format_ == DEFLATE ? -WINDOW_BITS : WINDOW_BITS | DETECT_CODEC;
+ if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) {
+ throw ParquetException("zlib inflateInit failed: " + std::string(stream_.msg));
+ }
+ decompressor_initialized_ = true;
+ }
- // Initialize to run either deflate or zlib/gzip format
- int window_bits = format_ == DEFLATE ? -WINDOW_BITS : WINDOW_BITS | DETECT_CODEC;
- if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) {
- throw ParquetException("zlib inflateInit failed: " + std::string(stream_.msg));
+ void EndDecompressor() {
+ if (decompressor_initialized_) { (void)inflateEnd(&stream_); }
+ decompressor_initialized_ = false;
}
- decompressor_initialized_ = true;
-}
-void GZipCodec::EndDecompressor() {
- if (decompressor_initialized_) { (void)inflateEnd(&stream_); }
- decompressor_initialized_ = false;
-}
+ void Decompress(int64_t input_length, const uint8_t* input, int64_t output_length,
+ uint8_t* output) {
+ if (!decompressor_initialized_) { InitDecompressor(); }
+ if (output_length == 0) {
+ // The zlib library does not allow *output to be NULL, even when output_length
+ // is 0 (inflate() will return Z_STREAM_ERROR). We don't consider this an
+ // error, so bail early if no output is expected. Note that we don't signal
+ // an error if the input actually contains compressed data.
+ return;
+ }
-void GZipCodec::Decompress(
- int64_t input_length, const uint8_t* input, int64_t output_length, uint8_t* output) {
- if (!decompressor_initialized_) { InitDecompressor(); }
- if (output_length == 0) {
- // The zlib library does not allow *output to be NULL, even when output_length
- // is 0 (inflate() will return Z_STREAM_ERROR). We don't consider this an
- // error, so bail early if no output is expected. Note that we don't signal
- // an error if the input actually contains compressed data.
- return;
+ // Reset the stream for this block
+ if (inflateReset(&stream_) != Z_OK) {
+ throw ParquetException("zlib inflateReset failed: " + std::string(stream_.msg));
+ }
+
+ int ret = 0;
+ // gzip can run in streaming mode or non-streaming mode. We only
+ // support the non-streaming use case where we present it the entire
+ // compressed input and a buffer big enough to contain the entire
+ // compressed output. In the case where we don't know the output,
+ // we just make a bigger buffer and try the non-streaming mode
+ // from the beginning again.
+ while (ret != Z_STREAM_END) {
+ stream_.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input));
+ stream_.avail_in = input_length;
+ stream_.next_out = reinterpret_cast<Bytef*>(output);
+ stream_.avail_out = output_length;
+
+ // We know the output size. In this case, we can use Z_FINISH
+ // which is more efficient.
+ ret = inflate(&stream_, Z_FINISH);
+ if (ret == Z_STREAM_END || ret != Z_OK) break;
+
+ // Failure, buffer was too small
+ std::stringstream ss;
+ ss << "Too small a buffer passed to GZipCodec. InputLength=" << input_length
+ << " OutputLength=" << output_length;
+ throw ParquetException(ss.str());
+ }
+
+ // Failure for some other reason
+ if (ret != Z_STREAM_END) {
+ std::stringstream ss;
+ ss << "GZipCodec failed: ";
+ if (stream_.msg != NULL) ss << stream_.msg;
+ throw ParquetException(ss.str());
+ }
}
- // Reset the stream for this block
- if (inflateReset(&stream_) != Z_OK) {
- throw ParquetException("zlib inflateReset failed: " + std::string(stream_.msg));
+ int64_t MaxCompressedLen(int64_t input_length, const uint8_t* input) {
+ // Must be in compression mode
+ if (!compressor_initialized_) { InitCompressor(); }
+ // TODO(wesm): deal with zlib < 1.2.3 (see Impala codebase)
+ return deflateBound(&stream_, static_cast<uLong>(input_length));
}
- int ret = 0;
- // gzip can run in streaming mode or non-streaming mode. We only
- // support the non-streaming use case where we present it the entire
- // compressed input and a buffer big enough to contain the entire
- // compressed output. In the case where we don't know the output,
- // we just make a bigger buffer and try the non-streaming mode
- // from the beginning again.
- while (ret != Z_STREAM_END) {
+ int64_t Compress(int64_t input_length, const uint8_t* input, int64_t output_length,
+ uint8_t* output) {
+ if (!compressor_initialized_) { InitCompressor(); }
stream_.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input));
stream_.avail_in = input_length;
stream_.next_out = reinterpret_cast<Bytef*>(output);
stream_.avail_out = output_length;
- // We know the output size. In this case, we can use Z_FINISH
- // which is more efficient.
- ret = inflate(&stream_, Z_FINISH);
- if (ret == Z_STREAM_END || ret != Z_OK) break;
+ int64_t ret = 0;
+ if ((ret = deflate(&stream_, Z_FINISH)) != Z_STREAM_END) {
+ if (ret == Z_OK) {
+ // will return Z_OK (and stream.msg NOT set) if stream.avail_out is too
+ // small
+ throw ParquetException("zlib deflate failed, output buffer too small");
+ }
+ std::stringstream ss;
+ ss << "zlib deflate failed: " << stream_.msg;
+ throw ParquetException(ss.str());
+ }
- // Failure, buffer was too small
- std::stringstream ss;
- ss << "Too small a buffer passed to GZipCodec. InputLength=" << input_length
- << " OutputLength=" << output_length;
- throw ParquetException(ss.str());
- }
+ if (deflateReset(&stream_) != Z_OK) {
+ throw ParquetException("zlib deflateReset failed: " + std::string(stream_.msg));
+ }
- // Failure for some other reason
- if (ret != Z_STREAM_END) {
- std::stringstream ss;
- ss << "GZipCodec failed: ";
- if (stream_.msg != NULL) ss << stream_.msg;
- throw ParquetException(ss.str());
+ // Actual output length
+ return output_length - stream_.avail_out;
}
+
+ private:
+ // zlib is stateful and the z_stream state variable must be initialized
+ // before
+ z_stream stream_;
+
+ // Realistically, this will always be GZIP, but we leave the option open to
+ // configure
+ GZipCodec::Format format_;
+
+ // These variables are mutually exclusive. When the codec is in "compressor"
+ // state, compressor_initialized_ is true while decompressor_initialized_ is
+ // false. When it's decompressing, the opposite is true.
+ //
+ // Indeed, this is slightly hacky, but the alternative is having separate
+ // Compressor and Decompressor classes. If this ever becomes an issue, we can
+ // perform the refactoring then
+ bool compressor_initialized_;
+ bool decompressor_initialized_;
+};
+
+GZipCodec::GZipCodec(Format format) {
+ impl_.reset(new GZipCodecImpl(format));
+}
+
+GZipCodec::~GZipCodec() {}
+
+void GZipCodec::Decompress(
+ int64_t input_length, const uint8_t* input, int64_t output_length, uint8_t* output) {
+ return impl_->Decompress(input_length, input, output_length, output);
}
int64_t GZipCodec::MaxCompressedLen(int64_t input_length, const uint8_t* input) {
- // Must be in compression mode
- if (!compressor_initialized_) { InitCompressor(); }
- // TODO(wesm): deal with zlib < 1.2.3 (see Impala codebase)
- return deflateBound(&stream_, static_cast<uLong>(input_length));
+ return impl_->MaxCompressedLen(input_length, input);
}
int64_t GZipCodec::Compress(
int64_t input_length, const uint8_t* input, int64_t output_length, uint8_t* output) {
- if (!compressor_initialized_) { InitCompressor(); }
- stream_.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input));
- stream_.avail_in = input_length;
- stream_.next_out = reinterpret_cast<Bytef*>(output);
- stream_.avail_out = output_length;
-
- int64_t ret = 0;
- if ((ret = deflate(&stream_, Z_FINISH)) != Z_STREAM_END) {
- if (ret == Z_OK) {
- // will return Z_OK (and stream.msg NOT set) if stream.avail_out is too
- // small
- throw ParquetException("zlib deflate failed, output buffer too small");
- }
- std::stringstream ss;
- ss << "zlib deflate failed: " << stream_.msg;
- throw ParquetException(ss.str());
- }
-
- if (deflateReset(&stream_) != Z_OK) {
- throw ParquetException("zlib deflateReset failed: " + std::string(stream_.msg));
- }
+ return impl_->Compress(input_length, input, output_length, output);
+}
- // Actual output length
- return output_length - stream_.avail_out;
+const char* GZipCodec::name() const {
+ return "gzip";
}
// ----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/compression.h
----------------------------------------------------------------------
diff --git a/src/parquet/compression.h b/src/parquet/compression.h
index f006e54..f0a38b6 100644
--- a/src/parquet/compression.h
+++ b/src/parquet/compression.h
@@ -18,19 +18,18 @@
#ifndef PARQUET_COMPRESSION_CODEC_H
#define PARQUET_COMPRESSION_CODEC_H
-#include <zlib.h>
-
#include <cstdint>
#include <memory>
#include "parquet/exception.h"
#include "parquet/types.h"
+#include "parquet/util/visibility.h"
namespace parquet {
-class Codec {
+class PARQUET_EXPORT Codec {
public:
- virtual ~Codec() {}
+ virtual ~Codec();
static std::unique_ptr<Codec> Create(Compression::type codec);
@@ -48,15 +47,15 @@ class Codec {
// Snappy codec.
class PARQUET_EXPORT SnappyCodec : public Codec {
public:
- virtual void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len,
- uint8_t* output_buffer);
+ void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len,
+ uint8_t* output_buffer) override;
- virtual int64_t Compress(int64_t input_len, const uint8_t* input,
- int64_t output_buffer_len, uint8_t* output_buffer);
+ int64_t Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override;
- virtual int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input);
+ int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override;
- virtual const char* name() const { return "snappy"; }
+ const char* name() const { return "snappy"; }
};
// Brotli codec.
@@ -86,38 +85,20 @@ class PARQUET_EXPORT GZipCodec : public Codec {
explicit GZipCodec(Format format = GZIP);
virtual ~GZipCodec();
- virtual void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len,
- uint8_t* output_buffer);
+ void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len,
+ uint8_t* output_buffer) override;
- virtual int64_t Compress(int64_t input_len, const uint8_t* input,
- int64_t output_buffer_len, uint8_t* output_buffer);
+ int64_t Compress(int64_t input_len, const uint8_t* input,
+ int64_t output_buffer_len, uint8_t* output_buffer) override;
- virtual int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input);
+ int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override;
- virtual const char* name() const { return "gzip"; }
+ const char* name() const override;
private:
- // zlib is stateful and the z_stream state variable must be initialized
- // before
- z_stream stream_;
-
- // Realistically, this will always be GZIP, but we leave the option open to
- // configure
- Format format_;
-
- // These variables are mutually exclusive. When the codec is in "compressor"
- // state, compressor_initialized_ is true while decompressor_initialized_ is
- // false. When it's decompressing, the opposite is true.
- //
- // Indeed, this is slightly hacky, but the alternative is having separate
- // Compressor and Decompressor classes. If this ever becomes an issue, we can
- // perform the refactoring then
- void InitCompressor();
- void InitDecompressor();
- void EndCompressor();
- void EndDecompressor();
- bool compressor_initialized_;
- bool decompressor_initialized_;
+ // The gzip compressor is stateful
+ class GZipCodecImpl;
+ std::unique_ptr<GZipCodecImpl> impl_;
};
} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/public-api-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/public-api-test.cc b/src/parquet/public-api-test.cc
index a5c88da..4d6f675 100644
--- a/src/parquet/public-api-test.cc
+++ b/src/parquet/public-api-test.cc
@@ -34,6 +34,12 @@ TEST(TestPublicAPI, DoesNotExportDCHECK) {
#endif
}
+TEST(TestPublicAPI, DoesNotIncludeZlib) {
+#ifdef ZLIB_H
+ FAIL() << "zlib.h should not be transitively included";
+#endif
+}
+
void ThrowsParquetException() {
throw parquet::ParquetException("This function throws");
}