You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2017/07/03 20:39:14 UTC

arrow git commit: ARROW-600: ZSTD compression lib support

Repository: arrow
Updated Branches:
  refs/heads/master e18abac19 -> cdee23c27


ARROW-600: ZSTD compression lib support

Author: Max Risukhin <ri...@gmail.com>

Closes #807 from MaxRis/ARROW-600 and squashes the following commits:

2fc4578 [Max Risukhin] ARROW-600: ZSTD compression lib support


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/cdee23c2
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/cdee23c2
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/cdee23c2

Branch: refs/heads/master
Commit: cdee23c27ac36f957512e33cc1ee49674c515dc8
Parents: e18abac
Author: Max Risukhin <ri...@gmail.com>
Authored: Mon Jul 3 22:39:09 2017 +0200
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Mon Jul 3 22:39:09 2017 +0200

----------------------------------------------------------------------
 cpp/CMakeLists.txt                          |  3 +-
 cpp/build-support/build-zstd-lib.sh         | 16 ++++++
 cpp/cmake_modules/FindZSTD.cmake            | 70 ++++++++++++++++++++++++
 cpp/cmake_modules/ThirdpartyToolchain.cmake | 49 ++++++++++++++++-
 cpp/src/arrow/util/compression-test.cc      |  4 ++
 cpp/src/arrow/util/compression.cc           | 28 ++++++++++
 cpp/src/arrow/util/compression.h            | 16 +++++-
 python/manylinux1/Dockerfile-x86_64_base    |  8 +++
 python/manylinux1/scripts/build_lz4.sh      | 24 ++++++++
 python/manylinux1/scripts/build_zstd.sh     | 25 +++++++++
 10 files changed, 239 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5745338..28a3bb0 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -530,7 +530,8 @@ set(ARROW_STATIC_LINK_LIBS
   brotli_enc
   brotli_common
   snappy
-  zlib)
+  zlib
+  zstd_static)
 
 add_dependencies(arrow_dependencies ${ARROW_STATIC_LINK_LIBS})
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/build-support/build-zstd-lib.sh
----------------------------------------------------------------------
diff --git a/cpp/build-support/build-zstd-lib.sh b/cpp/build-support/build-zstd-lib.sh
new file mode 100755
index 0000000..62805ba
--- /dev/null
+++ b/cpp/build-support/build-zstd-lib.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Extend any caller-supplied CFLAGS with -O3 and -fPIC and export the result
+# so that the make invocation below inherits it.
+CFLAGS="${CFLAGS} -O3 -fPIC"
+export CFLAGS
+# Run the default make target of the current directory with four parallel jobs.
+make -j4
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/cmake_modules/FindZSTD.cmake
----------------------------------------------------------------------
diff --git a/cpp/cmake_modules/FindZSTD.cmake b/cpp/cmake_modules/FindZSTD.cmake
new file mode 100644
index 0000000..1fda29e
--- /dev/null
+++ b/cpp/cmake_modules/FindZSTD.cmake
@@ -0,0 +1,70 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# - Find ZSTD (zstd.h, libzstd.a, libzstd.so, and libzstd.so.0)
+# This module defines
+#  ZSTD_INCLUDE_DIR, directory containing headers
+#  ZSTD_SHARED_LIB, path to libzstd shared library
+#  ZSTD_STATIC_LIB, path to libzstd static library
+#  ZSTD_FOUND, whether zstd has been found
+
+# Collect the root directories to search. A non-empty ZSTD_HOME takes
+# precedence and is first converted to CMake path syntax; otherwise the
+# alternative spelling ZStd_HOME is used when it evaluates to true.
+if( NOT "${ZSTD_HOME}" STREQUAL "")
+    file( TO_CMAKE_PATH "${ZSTD_HOME}" _native_path )
+    list( APPEND _zstd_roots ${_native_path} )
+elseif ( ZStd_HOME )
+    list( APPEND _zstd_roots ${ZStd_HOME} )
+endif()
+
+# With MSVC the static library name carries an extra "_static" suffix unless
+# the caller already provided ZSTD_MSVC_STATIC_LIB_SUFFIX.
+if (MSVC AND NOT ZSTD_MSVC_STATIC_LIB_SUFFIX)
+  set(ZSTD_MSVC_STATIC_LIB_SUFFIX "_static")
+endif()
+
+# Suffix of the static library file: optional MSVC suffix followed by the
+# platform's static library extension.
+set(ZSTD_STATIC_LIB_SUFFIX
+  "${ZSTD_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+
+# Complete static library file name: platform prefix + "zstd" + suffix.
+set(ZSTD_STATIC_LIB_NAME
+  ${CMAKE_STATIC_LIBRARY_PREFIX}zstd${ZSTD_STATIC_LIB_SUFFIX})
+
+# Locate the header, the shared library and the static library.
+# With explicit roots, only those roots (and their "include" / "lib"
+# subdirectories) are searched; all default locations are skipped.
+if ( _zstd_roots )
+  find_path(ZSTD_INCLUDE_DIR NAMES zstd.h
+    PATHS ${_zstd_roots}
+    NO_DEFAULT_PATH
+    PATH_SUFFIXES "include" )
+  find_library(ZSTD_SHARED_LIB NAMES zstd
+    PATHS ${_zstd_roots}
+    NO_DEFAULT_PATH
+    PATH_SUFFIXES "lib" )
+  find_library(ZSTD_STATIC_LIB NAMES ${ZSTD_STATIC_LIB_NAME}
+    PATHS ${_zstd_roots}
+    NO_DEFAULT_PATH
+    PATH_SUFFIXES "lib" )
+else()
+  # No roots given: use the default search procedure, minus the CMake system
+  # paths and the system environment paths.
+  find_path(ZSTD_INCLUDE_DIR zstd.h
+    # make sure we don't accidentally pick up a different version
+    NO_CMAKE_SYSTEM_PATH
+    NO_SYSTEM_ENVIRONMENT_PATH)
+  find_library(ZSTD_SHARED_LIB zstd
+    NO_CMAKE_SYSTEM_PATH
+    NO_SYSTEM_ENVIRONMENT_PATH)
+  find_library(ZSTD_STATIC_LIB ${ZSTD_STATIC_LIB_NAME}
+    NO_CMAKE_SYSTEM_PATH
+    NO_SYSTEM_ENVIRONMENT_PATH)
+endif()
+
+# Set ZSTD_FOUND. All three variables are required, so ZSTD counts as found
+# only when the shared library, the static library and the include directory
+# were all located.
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ZSTD REQUIRED_VARS
+  ZSTD_SHARED_LIB ZSTD_STATIC_LIB ZSTD_INCLUDE_DIR)

http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/cmake_modules/ThirdpartyToolchain.cmake
----------------------------------------------------------------------
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 8573345..33447ae 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -28,6 +28,7 @@ set(JEMALLOC_VERSION "4.4.0")
 set(SNAPPY_VERSION "1.1.3")
 set(BROTLI_VERSION "v0.6.0")
 set(LZ4_VERSION "1.7.5")
+set(ZSTD_VERSION "1.2.0")
 
 string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE)
 
@@ -49,6 +50,7 @@ if (NOT "$ENV{ARROW_BUILD_TOOLCHAIN}" STREQUAL "")
   set(ZLIB_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}")
   set(BROTLI_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}")
   set(LZ4_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}")
+  set(ZSTD_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}")
 
   if (NOT DEFINED ENV{BOOST_ROOT})
     # Since we have to set this in the environment, we check whether
@@ -89,6 +91,10 @@ if (DEFINED ENV{LZ4_HOME})
   set(LZ4_HOME "$ENV{LZ4_HOME}")
 endif()
 
+# A ZSTD_HOME environment variable, when defined, replaces any ZSTD_HOME value
+# set earlier in this file (e.g. the one derived from ARROW_BUILD_TOOLCHAIN).
+if (DEFINED ENV{ZSTD_HOME})
+  set(ZSTD_HOME "$ENV{ZSTD_HOME}")
+endif()
+
 # ----------------------------------------------------------------------
 # Find pthreads
 
@@ -653,8 +659,8 @@ if (NOT LZ4_FOUND)
   set(LZ4_INCLUDE_DIR "${LZ4_BUILD_DIR}/lib")
 
   if (MSVC)
-    set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/visual/VS2010/bin/x64_Release/liblz4_static.lib")
-    set(LZ4_BUILD_COMMAND BUILD_COMMAND msbuild.exe /m /p:Configuration=Release /p:Platform=x64 /p:PlatformToolset=v140 /t:Build ${LZ4_BUILD_DIR}/visual/VS2010/lz4.sln)
+    set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/visual/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/liblz4_static.lib")
+    set(LZ4_BUILD_COMMAND BUILD_COMMAND msbuild.exe /m /p:Configuration=${CMAKE_BUILD_TYPE} /p:Platform=x64 /p:PlatformToolset=v140 /t:Build ${LZ4_BUILD_DIR}/visual/VS2010/lz4.sln)
   else()
     set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/lib/liblz4.a")
     set(LZ4_BUILD_COMMAND BUILD_COMMAND make -j4)
@@ -682,3 +688,42 @@ ADD_THIRDPARTY_LIB(lz4_static
 if (LZ4_VENDORED)
   add_dependencies(lz4_static lz4_ep)
 endif()
+
+# ----------------------------------------------------------------------
+# ZSTD
+
+# Prefer a pre-installed ZSTD located by the FindZSTD module. If none is
+# found, download the release pinned by ZSTD_VERSION and build the static
+# library as the external project zstd_ep.
+# Outputs: ZSTD_INCLUDE_DIR, ZSTD_STATIC_LIB and ZSTD_VENDORED (1 when built
+# here, 0 when a pre-installed copy is used).
+find_package(ZSTD)
+if (NOT ZSTD_FOUND)
+  set(ZSTD_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/zstd_ep-prefix/src/zstd_ep")
+  # Headers are consumed directly from the lib/ directory of the unpacked
+  # sources; there is no install step.
+  set(ZSTD_INCLUDE_DIR "${ZSTD_BUILD_DIR}/lib")
+
+  if (MSVC)
+    set(ZSTD_STATIC_LIB "${ZSTD_BUILD_DIR}/build/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/libzstd_static.lib")
+    set(ZSTD_BUILD_COMMAND BUILD_COMMAND msbuild ${ZSTD_BUILD_DIR}/build/VS2010/zstd.sln /t:Build /v:minimal /p:Configuration=${CMAKE_BUILD_TYPE} /p:Platform=x64 /p:PlatformToolset=v140 /p:OutDir=${ZSTD_BUILD_DIR}/build/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/ /p:SolutionDir=${ZSTD_BUILD_DIR}/build/VS2010/ )
+  else()
+    set(ZSTD_STATIC_LIB "${ZSTD_BUILD_DIR}/lib/libzstd.a")
+    set(ZSTD_BUILD_COMMAND BUILD_COMMAND ${CMAKE_SOURCE_DIR}/build-support/build-zstd-lib.sh)
+  endif()
+
+  # Declare the static library as a byproduct of zstd_ep so that generators
+  # which need a rule for every file they consume (e.g. Ninja) know where it
+  # comes from. BUILD_BYPRODUCTS is only understood by CMake 3.2 and newer,
+  # so it is left out on older versions.
+  set(ZSTD_BUILD_BYPRODUCTS)
+  if (NOT CMAKE_VERSION VERSION_LESS "3.2")
+    set(ZSTD_BUILD_BYPRODUCTS BUILD_BYPRODUCTS "${ZSTD_STATIC_LIB}")
+  endif()
+
+  # The build runs inside the unpacked source tree (BINARY_DIR); the update,
+  # patch, configure and install steps are disabled.
+  ExternalProject_Add(zstd_ep
+      URL "https://github.com/facebook/zstd/archive/v${ZSTD_VERSION}.tar.gz"
+      UPDATE_COMMAND ""
+      PATCH_COMMAND ""
+      CONFIGURE_COMMAND ""
+      INSTALL_COMMAND ""
+      BINARY_DIR ${ZSTD_BUILD_DIR}
+      ${ZSTD_BUILD_COMMAND}
+      ${ZSTD_BUILD_BYPRODUCTS}
+      )
+
+  set(ZSTD_VENDORED 1)
+else()
+  set(ZSTD_VENDORED 0)
+endif()
+
+# Add the ZSTD headers as system include directories and hand the static
+# library to ADD_THIRDPARTY_LIB under the name zstd_static.
+include_directories(SYSTEM ${ZSTD_INCLUDE_DIR})
+ADD_THIRDPARTY_LIB(zstd_static
+  STATIC_LIB ${ZSTD_STATIC_LIB})
+
+# When ZSTD is built by this project, make zstd_static depend on the zstd_ep
+# external project so the library is built before it is used.
+if (ZSTD_VENDORED)
+  add_dependencies(zstd_static zstd_ep)
+endif()

http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/src/arrow/util/compression-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/compression-test.cc b/cpp/src/arrow/util/compression-test.cc
index 1a0e5d7..3b19a6d 100644
--- a/cpp/src/arrow/util/compression-test.cc
+++ b/cpp/src/arrow/util/compression-test.cc
@@ -86,4 +86,8 @@ TEST(TestCompressors, GZip) {
   CheckCodec<GZipCodec>();
 }
 
+// Runs the CheckCodec test helper (also used by the other codec tests in
+// this file) against ZSTDCodec.
+TEST(TestCompressors, ZSTD) {
+  CheckCodec<ZSTDCodec>();
+}
+
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/src/arrow/util/compression.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/compression.cc b/cpp/src/arrow/util/compression.cc
index 070f857..df1afa3 100644
--- a/cpp/src/arrow/util/compression.cc
+++ b/cpp/src/arrow/util/compression.cc
@@ -31,6 +31,7 @@
 #include <brotli/encode.h>
 #include <snappy.h>
 #include <zlib.h>
+#include <zstd.h>
 
 #include "arrow/status.h"
 #include "arrow/util/logging.h"
@@ -329,4 +330,31 @@ Status BrotliCodec::Compress(int64_t input_len, const uint8_t* input,
   return Status::OK();
 }
 
+// ----------------------------------------------------------------------
+// ZSTD implementation
+
+// Decompresses `input_len` bytes from `input` into `output_buffer`.
+//
+// `output_len` must be the exact size of the decompressed data: it is passed
+// to ZSTD as the capacity of `output_buffer`, and any other decompressed size
+// is reported as corruption.
+//
+// Returns Status::OK() on success and Status::IOError if ZSTD reports an
+// error or the decompressed size differs from `output_len`.
+Status ZSTDCodec::Decompress(
+    int64_t input_len, const uint8_t* input, int64_t output_len, uint8_t* output_buffer) {
+  // Keep the result as size_t: ZSTD signals failure through special size_t
+  // values that have to be tested with ZSTD_isError() instead of being
+  // interpreted as a length.
+  size_t ret = ZSTD_decompress(output_buffer,
+      static_cast<size_t>(output_len), input, static_cast<size_t>(input_len));
+  if (ZSTD_isError(ret) || static_cast<int64_t>(ret) != output_len) {
+    return Status::IOError("Corrupt ZSTD compressed data.");
+  }
+  return Status::OK();
+}
+
+// Returns the value of ZSTD_compressBound() for `input_len` bytes of input.
+// Only the length is used; the `input` pointer itself is not inspected.
+int64_t ZSTDCodec::MaxCompressedLen(int64_t input_len, const uint8_t* input) {
+  const size_t bound = ZSTD_compressBound(static_cast<size_t>(input_len));
+  return static_cast<int64_t>(bound);
+}
+
+// Compresses `input_len` bytes from `input` into `output_buffer`, which has
+// room for `output_buffer_len` bytes, using ZSTD compression level 1.
+//
+// On success the compressed size is stored in `*output_length` and
+// Status::OK() is returned. On failure Status::IOError is returned and
+// `*output_length` is left untouched.
+Status ZSTDCodec::Compress(int64_t input_len, const uint8_t* input,
+    int64_t output_buffer_len, uint8_t* output_buffer, int64_t* output_length) {
+  // Check the raw size_t result before publishing it: ZSTD encodes errors as
+  // special size_t values, which must not leak to the caller as a length.
+  size_t ret = ZSTD_compress(output_buffer, static_cast<size_t>(output_buffer_len),
+      input, static_cast<size_t>(input_len), 1);
+  if (ZSTD_isError(ret)) {
+    return Status::IOError("ZSTD compression failure.");
+  }
+  *output_length = static_cast<int64_t>(ret);
+  return Status::OK();
+}
+
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/src/arrow/util/compression.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/compression.h b/cpp/src/arrow/util/compression.h
index 6886d04..9e581d8 100644
--- a/cpp/src/arrow/util/compression.h
+++ b/cpp/src/arrow/util/compression.h
@@ -27,7 +27,7 @@
 namespace arrow {
 
 struct Compression {
-  enum type { UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI };
+  enum type { UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, ZSTD };
 };
 
 class ARROW_EXPORT Codec {
@@ -104,6 +104,20 @@ class ARROW_EXPORT GZipCodec : public Codec {
   std::unique_ptr<GZipCodecImpl> impl_;
 };
 
+// ZSTD codec.
+//
+// Codec implementation backed by the ZSTD library's one-shot
+// ZSTD_compress() / ZSTD_decompress() functions.
+class ARROW_EXPORT ZSTDCodec : public Codec {
+ public:
+  // Decompresses `input` into `output_buffer`. `output_len` must equal the
+  // exact decompressed size; any other outcome yields Status::IOError.
+  Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_len,
+      uint8_t* output_buffer) override;
+
+  // Compresses `input` into `output_buffer` at ZSTD compression level 1 and
+  // reports the compressed size through `output_length`. Returns
+  // Status::IOError if ZSTD signals a failure.
+  Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len,
+      uint8_t* output_buffer, int64_t* output_length) override;
+
+  // Returns ZSTD_compressBound() for `input_len`; `input` is not examined.
+  int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override;
+
+  // Name of this codec: "zstd".
+  const char* name() const override { return "zstd"; }
+};
+
 }  // namespace arrow
 
 #endif

http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/python/manylinux1/Dockerfile-x86_64_base
----------------------------------------------------------------------
diff --git a/python/manylinux1/Dockerfile-x86_64_base b/python/manylinux1/Dockerfile-x86_64_base
index 44a9888..cdd13e2 100644
--- a/python/manylinux1/Dockerfile-x86_64_base
+++ b/python/manylinux1/Dockerfile-x86_64_base
@@ -49,6 +49,14 @@ ADD scripts/build_snappy.sh /
 RUN /build_snappy.sh
 ENV SNAPPY_HOME /usr
 
+# Build LZ4 with the helper script (installs under /usr) and publish the
+# location through LZ4_HOME, which the Arrow CMake toolchain file reads.
+ADD scripts/build_lz4.sh /
+RUN /build_lz4.sh
+ENV LZ4_HOME /usr
+
+# Same for ZSTD: build with the helper script and publish ZSTD_HOME.
+ADD scripts/build_zstd.sh /
+RUN /build_zstd.sh
+ENV ZSTD_HOME /usr
+
 ADD scripts/build_ccache.sh /
 RUN /build_ccache.sh
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/python/manylinux1/scripts/build_lz4.sh
----------------------------------------------------------------------
diff --git a/python/manylinux1/scripts/build_lz4.sh b/python/manylinux1/scripts/build_lz4.sh
new file mode 100755
index 0000000..5a25d3d
--- /dev/null
+++ b/python/manylinux1/scripts/build_lz4.sh
@@ -0,0 +1,24 @@
+#!/bin/bash -ex
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License. See accompanying LICENSE file.
+
+export LZ4_VERSION="1.7.5"
+export PREFIX="/usr"
+export LDFLAGS="${LDFLAGS} -Wl,-rpath,${PREFIX}/lib -L${PREFIX}/lib"
+wget "https://github.com/lz4/lz4/archive/v${LZ4_VERSION}.tar.gz" -O lz4-${LZ4_VERSION}.tar.gz
+tar xf lz4-${LZ4_VERSION}.tar.gz
+pushd lz4-${LZ4_VERSION}
+
+make -j5 PREFIX=${PREFIX}
+make install PREFIX=$PREFIX
+popd
+rm -rf lz4-${LZ4_VERSION}.tar.gz lz4-${LZ4_VERSION}

http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/python/manylinux1/scripts/build_zstd.sh
----------------------------------------------------------------------
diff --git a/python/manylinux1/scripts/build_zstd.sh b/python/manylinux1/scripts/build_zstd.sh
new file mode 100755
index 0000000..268e5c8
--- /dev/null
+++ b/python/manylinux1/scripts/build_zstd.sh
@@ -0,0 +1,25 @@
+#!/bin/bash -ex
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License. See accompanying LICENSE file.
+
+export ZSTD_VERSION="1.2.0"
+export CFLAGS="${CFLAGS} -O3 -fPIC"
+export PREFIX="/usr"
+export LDFLAGS="${LDFLAGS} -Wl,-rpath,${PREFIX}/lib"
+wget "https://github.com/facebook/zstd/archive/v${ZSTD_VERSION}.tar.gz" -O zstd-${ZSTD_VERSION}.tar.gz
+tar xf zstd-${ZSTD_VERSION}.tar.gz
+pushd zstd-${ZSTD_VERSION}
+
+make -j5
+make install PREFIX=$PREFIX
+popd
+rm -rf zstd-${ZSTD_VERSION}.tar.gz zstd-${ZSTD_VERSION}