You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/09/27 12:19:59 UTC

[arrow] 20/24: PARQUET-1196: Example parquet_arrow project

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 3a9dbdf467382f380eef6d36c50f2c302bb07ca0
Author: Uwe L. Korn <uw...@xhochy.com>
AuthorDate: Thu Feb 15 18:42:22 2018 +0100

    PARQUET-1196: Example parquet_arrow project
    
    Depends on https://github.com/apache/parquet-cpp/pull/434
    
    Author: Uwe L. Korn <uw...@xhochy.com>
    Author: Korn, Uwe <Uw...@blue-yonder.com>
    
    Closes #436 from xhochy/PARQUET-1196 and squashes the following commits:
    
    a938da7 [Uwe L. Korn] Check Status for PrettyPrint
    15d62f3 [Uwe L. Korn] PARQUET-1196: Example parquet_arrow project
    1280fd5 [Korn, Uwe] PARQUET-1200: Support reading a single Arrow column from a Parquet file
    
    Change-Id: I907f2276b319491f6e02117f4a21ab2383006a99
---
 .../parquet/{ => low-level-api}/CMakeLists.txt     |   0
 .../parquet/{ => low-level-api}/reader-writer.cc   |   0
 cpp/examples/parquet/parquet-arrow/CMakeLists.txt  |  78 +++++++++++
 cpp/examples/parquet/parquet-arrow/README.md       |  20 +++
 .../cmake_modules/ArrowExternalProject.cmake       |   1 +
 .../parquet-arrow/cmake_modules/FindArrow.cmake    |   1 +
 .../parquet-arrow/cmake_modules/FindParquet.cmake  | 145 +++++++++++++++++++++
 .../parquet/parquet-arrow/src/reader-writer.cc     | 134 +++++++++++++++++++
 8 files changed, 379 insertions(+)

diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/low-level-api/CMakeLists.txt
similarity index 100%
rename from cpp/examples/parquet/CMakeLists.txt
rename to cpp/examples/parquet/low-level-api/CMakeLists.txt
diff --git a/cpp/examples/parquet/reader-writer.cc b/cpp/examples/parquet/low-level-api/reader-writer.cc
similarity index 100%
rename from cpp/examples/parquet/reader-writer.cc
rename to cpp/examples/parquet/low-level-api/reader-writer.cc
diff --git a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt
new file mode 100644
index 0000000..897fcfb
--- /dev/null
+++ b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Require a CMake version that supports BYPRODUCTS in add_custom_command and ExternalProject_Add.
+cmake_minimum_required(VERSION 3.2.0)
+
+project(parquet-arrow-example)
+
+include(ExternalProject)
+include(FindPkgConfig)
+include(GNUInstallDirs)
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake_modules")
+
+# This ensures that things like gnu++11 get passed correctly
+set(CMAKE_CXX_STANDARD 11)
+
+# We require a C++11 compliant compiler
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# We want to link dynamically against Arrow and Parquet
+set(PARQUET_BUILD_SHARED ON)
+
+
+# First search for the packages on the system. If they are not found, use CMake's
+# ExternalProject mechanism to build them locally.
+find_package(Arrow)
+if (NOT ARROW_FOUND)
+  # set compile output directory
+  if (NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Debug)
+  endif(NOT CMAKE_BUILD_TYPE)
+  string (TOLOWER ${CMAKE_BUILD_TYPE} BUILD_SUBDIR_NAME)
+  # If building in-source, create the `latest` symlink. If building out-of-source,
+  # which is preferred, simply output the binaries in the build folder.
+  if (${CMAKE_SOURCE_DIR} STREQUAL "${CMAKE_CURRENT_BINARY_DIR}")
+    set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/build/${BUILD_SUBDIR_NAME}")
+    # Link build/latest to the current build directory, to avoid developers
+    # accidentally running the latest debug build when in fact they're building
+    # release builds.
+    FILE(MAKE_DIRECTORY ${BUILD_OUTPUT_ROOT_DIRECTORY})
+    if (NOT APPLE)
+      set(MORE_ARGS "-T")
+    endif()
+  EXECUTE_PROCESS(COMMAND ln ${MORE_ARGS} -sf ${BUILD_OUTPUT_ROOT_DIRECTORY}
+    ${CMAKE_CURRENT_BINARY_DIR}/build/latest)
+  else()
+    set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${BUILD_SUBDIR_NAME}")
+  endif()
+
+  include(ArrowExternalProject)
+  set(ARROW_VENDORED 1)
+else()
+  set(ARROW_VENDORED 0)
+endif()
+find_package(Parquet)
+
+include_directories(SYSTEM ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR})
+
+add_executable(parquet-arrow-reader-writer src/reader-writer.cc)
+target_link_libraries(parquet-arrow-reader-writer ${PARQUET_SHARED_LIB} ${ARROW_SHARED_LIB})
+if (ARROW_VENDORED)
+  add_dependencies(parquet-arrow-reader-writer arrow_ep)
+endif()
diff --git a/cpp/examples/parquet/parquet-arrow/README.md b/cpp/examples/parquet/parquet-arrow/README.md
new file mode 100644
index 0000000..e99819f
--- /dev/null
+++ b/cpp/examples/parquet/parquet-arrow/README.md
@@ -0,0 +1,20 @@
+<!---
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+Using parquet-cpp with the arrow interface
+==========================================
+
+This folder contains an example project that shows how to set up a CMake project
+that consumes `parquet-cpp` as a library as well as how you can use the
+`parquet/arrow` interface to read and write Apache Parquet files.
diff --git a/cpp/examples/parquet/parquet-arrow/cmake_modules/ArrowExternalProject.cmake b/cpp/examples/parquet/parquet-arrow/cmake_modules/ArrowExternalProject.cmake
new file mode 120000
index 0000000..b535f6e
--- /dev/null
+++ b/cpp/examples/parquet/parquet-arrow/cmake_modules/ArrowExternalProject.cmake
@@ -0,0 +1 @@
+../../../cmake_modules/ArrowExternalProject.cmake
\ No newline at end of file
diff --git a/cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake b/cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake
new file mode 120000
index 0000000..6c451ce
--- /dev/null
+++ b/cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake
@@ -0,0 +1 @@
+../../../cmake_modules/FindArrow.cmake
\ No newline at end of file
diff --git a/cpp/examples/parquet/parquet-arrow/cmake_modules/FindParquet.cmake b/cpp/examples/parquet/parquet-arrow/cmake_modules/FindParquet.cmake
new file mode 100644
index 0000000..8bbe05f
--- /dev/null
+++ b/cpp/examples/parquet/parquet-arrow/cmake_modules/FindParquet.cmake
@@ -0,0 +1,145 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# - Find PARQUET (parquet/api/reader.h, libparquet.a, libparquet.so)
+# This module defines
+#  PARQUET_INCLUDE_DIR, directory containing headers
+#  PARQUET_LIBS, directory containing parquet libraries
+#  PARQUET_STATIC_LIB, path to libparquet.a
+#  PARQUET_SHARED_LIB, path to libparquet's shared library
+#  PARQUET_SHARED_IMP_LIB, path to libparquet's import library (MSVC only)
+#  PARQUET_FOUND, whether parquet has been found
+
+include(FindPkgConfig)
+
+if(NOT "$ENV{PARQUET_HOME}" STREQUAL "")
+    set(PARQUET_HOME "$ENV{PARQUET_HOME}")
+endif()
+
+if (MSVC)
+  SET(CMAKE_FIND_LIBRARY_SUFFIXES ".lib" ".dll")
+
+  if (MSVC AND NOT PARQUET_MSVC_STATIC_LIB_SUFFIX)
+    set(PARQUET_MSVC_STATIC_LIB_SUFFIX "_static")
+  endif()
+
+  find_library(PARQUET_SHARED_LIBRARIES NAMES parquet
+    PATHS ${PARQUET_HOME} NO_DEFAULT_PATH
+    PATH_SUFFIXES "bin" )
+
+  get_filename_component(PARQUET_SHARED_LIBS ${PARQUET_SHARED_LIBRARIES} PATH )
+endif ()
+
+if(PARQUET_HOME)
+    set(PARQUET_SEARCH_HEADER_PATHS
+        ${PARQUET_HOME}/include
+        )
+    set(PARQUET_SEARCH_LIB_PATH
+        ${PARQUET_HOME}/lib
+        )
+    find_path(PARQUET_INCLUDE_DIR parquet/api/reader.h PATHS
+        ${PARQUET_SEARCH_HEADER_PATHS}
+        # make sure we don't accidentally pick up a different version
+        NO_DEFAULT_PATH
+        )
+    find_library(PARQUET_LIBRARIES NAMES parquet
+        PATHS ${PARQUET_HOME} NO_DEFAULT_PATH
+        PATH_SUFFIXES "lib")
+    get_filename_component(PARQUET_LIBS ${PARQUET_LIBRARIES} PATH )
+
+    # Try to autodiscover the Parquet ABI version
+    get_filename_component(PARQUET_LIB_REALPATH ${PARQUET_LIBRARIES} REALPATH)
+    get_filename_component(PARQUET_EXT_REALPATH ${PARQUET_LIB_REALPATH} EXT)
+    string(REGEX MATCH ".([0-9]+.[0-9]+.[0-9]+)" HAS_ABI_VERSION ${PARQUET_EXT_REALPATH})
+    if (HAS_ABI_VERSION)
+      if (APPLE)
+        string(REGEX REPLACE ".([0-9]+.[0-9]+.[0-9]+).dylib" "\\1" PARQUET_ABI_VERSION ${PARQUET_EXT_REALPATH})
+      else()
+        string(REGEX REPLACE ".so.([0-9]+.[0-9]+.[0-9]+)" "\\1" PARQUET_ABI_VERSION ${PARQUET_EXT_REALPATH})
+      endif()
+      string(REGEX REPLACE "([0-9]+).[0-9]+.[0-9]+" "\\1" PARQUET_SO_VERSION ${PARQUET_ABI_VERSION})
+    else()
+      set(PARQUET_ABI_VERSION "1.0.0")
+      set(PARQUET_SO_VERSION "1")
+    endif()
+else()
+    pkg_check_modules(PARQUET parquet)
+    if (PARQUET_FOUND)
+        pkg_get_variable(PARQUET_ABI_VERSION parquet abi_version)
+        message(STATUS "Parquet C++ ABI version: ${PARQUET_ABI_VERSION}")
+        pkg_get_variable(PARQUET_SO_VERSION parquet so_version)
+        message(STATUS "Parquet C++ SO version: ${PARQUET_SO_VERSION}")
+        set(PARQUET_INCLUDE_DIR ${PARQUET_INCLUDE_DIRS})
+        set(PARQUET_LIBS ${PARQUET_LIBRARY_DIRS})
+        set(PARQUET_SEARCH_LIB_PATH ${PARQUET_LIBRARY_DIRS})
+        message(STATUS "Searching for parquet libs in: ${PARQUET_SEARCH_LIB_PATH}")
+        find_library(PARQUET_LIBRARIES NAMES parquet
+            PATHS ${PARQUET_SEARCH_LIB_PATH} NO_DEFAULT_PATH)
+    else()
+        find_path(PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h )
+        find_library(PARQUET_LIBRARIES NAMES parquet)
+        get_filename_component(PARQUET_LIBS ${PARQUET_LIBRARIES} PATH )
+    endif()
+endif()
+
+if (PARQUET_INCLUDE_DIR AND PARQUET_LIBRARIES)
+  set(PARQUET_FOUND TRUE)
+  set(PARQUET_LIB_NAME parquet)
+  if (MSVC)
+    set(PARQUET_STATIC_LIB "${PARQUET_LIBS}/${PARQUET_LIB_NAME}${PARQUET_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+    set(PARQUET_SHARED_LIB "${PARQUET_SHARED_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}")
+    set(PARQUET_SHARED_IMP_LIB "${PARQUET_LIBS}/${PARQUET_LIB_NAME}.lib")
+  else()
+    set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${CMAKE_STATIC_LIBRARY_PREFIX}${PARQUET_LIB_NAME}.a)
+    set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${CMAKE_SHARED_LIBRARY_PREFIX}${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
+  endif()
+else ()
+  set(PARQUET_FOUND FALSE)
+endif ()
+
+if (PARQUET_FOUND)
+  if (NOT Parquet_FIND_QUIETLY)
+    message(STATUS "Found the Parquet library: ${PARQUET_LIBRARIES}")
+  endif ()
+else ()
+  if (NOT Parquet_FIND_QUIETLY)
+    if (NOT PARQUET_FOUND)
+      set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} Could not find the parquet library.")
+    endif()
+
+    set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} Looked in ")
+    if ( _parquet_roots )
+      set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} in ${_parquet_roots}.")
+    else ()
+      set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} system search paths.")
+    endif ()
+    if (Parquet_FIND_REQUIRED)
+      message(FATAL_ERROR "${PARQUET_ERR_MSG}")
+    else (Parquet_FIND_REQUIRED)
+      message(STATUS "${PARQUET_ERR_MSG}")
+    endif (Parquet_FIND_REQUIRED)
+  endif ()
+endif ()
+
+mark_as_advanced(
+  PARQUET_FOUND
+  PARQUET_INCLUDE_DIR
+  PARQUET_LIBS
+  PARQUET_LIBRARIES
+  PARQUET_STATIC_LIB
+  PARQUET_SHARED_LIB
+)
diff --git a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc b/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc
new file mode 100644
index 0000000..f333cab
--- /dev/null
+++ b/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <arrow/io/api.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <parquet/exception.h>
+
+// #0: Build the dummy input data that the following steps write and read back.
+// Returns an Arrow Table with an int64 column "int" (values 1..5) and a utf8
+// column "str" (five strings); statuses are checked with PARQUET_THROW_NOT_OK.
+std::shared_ptr<arrow::Table> generate_table() {
+  arrow::Int64Builder i64builder;
+  PARQUET_THROW_NOT_OK(i64builder.Append({1, 2, 3, 4, 5}));
+  std::shared_ptr<arrow::Array> i64array;
+  PARQUET_THROW_NOT_OK(i64builder.Finish(&i64array));
+
+  arrow::StringBuilder strbuilder;
+  PARQUET_THROW_NOT_OK(strbuilder.Append("some"));
+  PARQUET_THROW_NOT_OK(strbuilder.Append("string"));
+  PARQUET_THROW_NOT_OK(strbuilder.Append("content"));
+  PARQUET_THROW_NOT_OK(strbuilder.Append("in"));
+  PARQUET_THROW_NOT_OK(strbuilder.Append("rows"));
+  std::shared_ptr<arrow::Array> strarray;
+  PARQUET_THROW_NOT_OK(strbuilder.Finish(&strarray));
+
+  std::shared_ptr<arrow::Schema> schema = arrow::schema(
+      {arrow::field("int", arrow::int64()), arrow::field("str", arrow::utf8())});
+
+  return arrow::Table::Make(schema, {i64array, strarray});
+}
+
+// #1: Write the given table to the file "parquet-arrow-example.parquet"
+void write_parquet_file(const arrow::Table& table) {
+  std::shared_ptr<arrow::io::FileOutputStream> outfile;
+  PARQUET_THROW_NOT_OK(
+      arrow::io::FileOutputStream::Open("parquet-arrow-example.parquet", &outfile));
+  // The last argument to WriteTable (3 here) is the size of the RowGroup in
+  // the Parquet file. Normally you would choose this to be rather large, but
+  // for the example we use a small value so the file has multiple RowGroups.
+  PARQUET_THROW_NOT_OK(
+      parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3));
+}
+
+// #2: Read the whole file into one Arrow Table and print its row and column counts
+void read_whole_file() {
+  std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl;
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+  PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
+      "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
+
+  std::unique_ptr<parquet::arrow::FileReader> reader;
+  PARQUET_THROW_NOT_OK(
+      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+  std::shared_ptr<arrow::Table> table;
+  PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
+  std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns()
+            << " columns." << std::endl;
+}
+
+// #3: Read only the first RowGroup (index 0) into a Table and print its row and column counts
+void read_single_rowgroup() {
+  std::cout << "Reading first RowGroup of parquet-arrow-example.parquet" << std::endl;
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+  PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
+      "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
+
+  std::unique_ptr<parquet::arrow::FileReader> reader;
+  PARQUET_THROW_NOT_OK(
+      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+  std::shared_ptr<arrow::Table> table;
+  PARQUET_THROW_NOT_OK(reader->RowGroup(0)->ReadTable(&table));
+  std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns()
+            << " columns." << std::endl;
+}
+
+// #4: Read only the first column (index 0) of the whole file and pretty-print it to stdout
+void read_single_column() {
+  std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl;
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+  PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
+      "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
+
+  std::unique_ptr<parquet::arrow::FileReader> reader;
+  PARQUET_THROW_NOT_OK(
+      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+  std::shared_ptr<arrow::Array> array;
+  PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array));
+  PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout));
+  std::cout << std::endl;
+}
+
+// #5: Read only the first column (index 0) of the first RowGroup (index 0) and
+//     pretty-print it; a single column of a RowGroup is known as a ColumnChunk.
+void read_single_column_chunk() {
+  std::cout << "Reading first ColumnChunk of the first RowGroup of "
+               "parquet-arrow-example.parquet"
+            << std::endl;
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+  PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
+      "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
+
+  std::unique_ptr<parquet::arrow::FileReader> reader;
+  PARQUET_THROW_NOT_OK(
+      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+  std::shared_ptr<arrow::Array> array;
+  PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array));
+  PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout));
+  std::cout << std::endl;
+}
+
+int main(int argc, char** argv) {  // argc and argv are not used by this example
+  std::shared_ptr<arrow::Table> table = generate_table();  // #0: build the input table
+  write_parquet_file(*table);  // #1: write it out as a Parquet file
+  read_whole_file();  // #2: read everything back
+  read_single_rowgroup();  // #3: read one RowGroup
+  read_single_column();  // #4: read one column
+  read_single_column_chunk();  // #5: read one ColumnChunk
+}