You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2016/12/23 22:29:28 UTC
[2/2] arrow git commit: ARROW-432: [Python] Construct precise pandas
BlockManager structure for zero-copy DataFrame initialization
ARROW-432: [Python] Construct precise pandas BlockManager structure for zero-copy DataFrame initialization
This avoids "memory tripling" (because pd.DataFrame will often immediately consolidate the arrays) and will also allow the Arrow->pandas deserialization to be parallelized for further performance gains.
@xhochy this also has the effect of coercing all timestamps to `datetime64[ns]` -- for pandas I believe this is the proper behavior, but wanted to run it by you.
In a local benchmark on roughly 1GB of data I have:
setup code:
```python
import numpy as np
import pandas as pd
import pyarrow as pa
DATA_SIZE = (1 << 30)
NCOLS = 100
data = np.random.randn(NCOLS, DATA_SIZE / NCOLS / 8).T
data[::2] = np.nan
df = pd.DataFrame(data, columns=['c' + str(i) for i in range(NCOLS)])
table = pa.Table.from_pandas(df)
```
Before these changes (I added a `block_based` argument to toggle this code path off):
```python
In [4]: %timeit df2 = table.to_pandas(block_based=False)
1 loop, best of 3: 252 ms per loop
```
After these changes:
```python
In [5]: %timeit df2 = table.to_pandas()
10 loops, best of 3: 139 ms per loop
```
This takes the effective in-memory bandwidth on numerical data from 3.97 GB/s to 7.19 GB/s.
I also moved the clang-format files to the top level so we can more easily run code formatting on the C++ code under python/.
Author: Wes McKinney <we...@twosigma.com>
Closes #251 from wesm/ARROW-432 and squashes the following commits:
f22e1b5 [Wes McKinney] Remove unneeded import
ea83ded [Wes McKinney] Unit tests pass again
ec239b8 [Wes McKinney] Fix DataFrame constructions, code formatting
af960ee [Wes McKinney] Draft blocks -> DataFrame scaffold
110692f [Wes McKinney] First draft of scaffolding for creating precise pandas.DataFrame block structure
4928a0b [Wes McKinney] Refactor post rebase
c89cfaf [Wes McKinney] Rearrange to-pandas deserialization to better permit reads into pre-allocated memory
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/65af9ea1
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/65af9ea1
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/65af9ea1
Branch: refs/heads/master
Commit: 65af9ea16a3c9241a66203b57cbfe2041a5ee52b
Parents: fd4eb98
Author: Wes McKinney <we...@twosigma.com>
Authored: Fri Dec 23 17:29:17 2016 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Dec 23 17:29:17 2016 -0500
----------------------------------------------------------------------
.clang-format | 65 ++
.clang-tidy | 14 +
.clang-tidy-ignore | 2 +
cpp/CMakeLists.txt | 3 +-
cpp/src/.clang-format | 65 --
cpp/src/.clang-tidy | 14 -
cpp/src/.clang-tidy-ignore | 2 -
python/pyarrow/includes/pyarrow.pxd | 5 +-
python/pyarrow/table.pyx | 54 +-
python/src/pyarrow/adapters/builtin.cc | 66 +-
python/src/pyarrow/adapters/builtin.h | 4 +-
python/src/pyarrow/adapters/pandas.cc | 1184 +++++++++++++++++++--------
python/src/pyarrow/adapters/pandas.h | 33 +-
python/src/pyarrow/api.h | 2 +-
python/src/pyarrow/common.cc | 4 +-
python/src/pyarrow/common.h | 52 +-
python/src/pyarrow/config.cc | 5 +-
python/src/pyarrow/config.h | 6 +-
python/src/pyarrow/helpers.cc | 37 +-
python/src/pyarrow/helpers.h | 4 +-
python/src/pyarrow/io.cc | 14 +-
python/src/pyarrow/io.h | 6 +-
python/src/pyarrow/numpy_interop.h | 4 +-
python/src/pyarrow/util/datetime.h | 8 +-
python/src/pyarrow/util/test_main.cc | 2 +-
25 files changed, 1067 insertions(+), 588 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/.clang-format
----------------------------------------------------------------------
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..7d5b3cf
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,65 @@
+---
+Language: Cpp
+# BasedOnStyle: Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: false
+AlignConsecutiveAssignments: false
+AlignEscapedNewlinesLeft: true
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: true
+BinPackParameters: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+ColumnLimit: 90
+CommentPragmas: '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IndentCaseLabels: true
+IndentWidth: 2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1000
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+TabWidth: 8
+UseTab: Never
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/.clang-tidy
----------------------------------------------------------------------
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000..deaa9bd
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,14 @@
+---
+Checks: 'clang-diagnostic-*,clang-analyzer-*,-clang-analyzer-alpha*,google-.*,modernize-.*,readablity-.*'
+HeaderFilterRegex: 'arrow/.*'
+AnalyzeTemporaryDtors: true
+CheckOptions:
+ - key: google-readability-braces-around-statements.ShortStatementLines
+ value: '1'
+ - key: google-readability-function-size.StatementThreshold
+ value: '800'
+ - key: google-readability-namespace-comments.ShortNamespaceLines
+ value: '10'
+ - key: google-readability-namespace-comments.SpacesBeforeComments
+ value: '2'
+
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/.clang-tidy-ignore
----------------------------------------------------------------------
diff --git a/.clang-tidy-ignore b/.clang-tidy-ignore
new file mode 100644
index 0000000..5ab4d20
--- /dev/null
+++ b/.clang-tidy-ignore
@@ -0,0 +1,2 @@
+ipc-adapter-test.cc
+memory-pool-test.cc
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 315995c..93e9853 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -741,7 +741,8 @@ endif (UNIX)
if (${CLANG_FORMAT_FOUND})
# runs clang format and updates files in place.
add_custom_target(format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 1
- `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
+ `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`
+ `find ${CMAKE_CURRENT_SOURCE_DIR}/../python -name \\*.cc -or -name \\*.h`)
# runs clang format and exits with a non-zero exit code if any files need to be reformatted
add_custom_target(check-format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 0
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/cpp/src/.clang-format
----------------------------------------------------------------------
diff --git a/cpp/src/.clang-format b/cpp/src/.clang-format
deleted file mode 100644
index 7d5b3cf..0000000
--- a/cpp/src/.clang-format
+++ /dev/null
@@ -1,65 +0,0 @@
----
-Language: Cpp
-# BasedOnStyle: Google
-AccessModifierOffset: -1
-AlignAfterOpenBracket: false
-AlignConsecutiveAssignments: false
-AlignEscapedNewlinesLeft: true
-AlignOperands: true
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: true
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: Inline
-AllowShortIfStatementsOnASingleLine: true
-AllowShortLoopsOnASingleLine: false
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: true
-BinPackArguments: true
-BinPackParameters: true
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-ColumnLimit: 90
-CommentPragmas: '^ IWYU pragma:'
-ConstructorInitializerAllOnOneLineOrOnePerLine: true
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: true
-DerivePointerAlignment: false
-DisableFormat: false
-ExperimentalAutoDetectBinPacking: false
-ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
-IndentCaseLabels: true
-IndentWidth: 2
-IndentWrappedFunctionNames: false
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd: ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: false
-PenaltyBreakBeforeFirstCallParameter: 1000
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
-PointerAlignment: Left
-SpaceAfterCStyleCast: false
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeParens: ControlStatements
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles: false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard: Cpp11
-TabWidth: 8
-UseTab: Never
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/cpp/src/.clang-tidy
----------------------------------------------------------------------
diff --git a/cpp/src/.clang-tidy b/cpp/src/.clang-tidy
deleted file mode 100644
index deaa9bd..0000000
--- a/cpp/src/.clang-tidy
+++ /dev/null
@@ -1,14 +0,0 @@
----
-Checks: 'clang-diagnostic-*,clang-analyzer-*,-clang-analyzer-alpha*,google-.*,modernize-.*,readablity-.*'
-HeaderFilterRegex: 'arrow/.*'
-AnalyzeTemporaryDtors: true
-CheckOptions:
- - key: google-readability-braces-around-statements.ShortStatementLines
- value: '1'
- - key: google-readability-function-size.StatementThreshold
- value: '800'
- - key: google-readability-namespace-comments.ShortNamespaceLines
- value: '10'
- - key: google-readability-namespace-comments.SpacesBeforeComments
- value: '2'
-
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/cpp/src/.clang-tidy-ignore
----------------------------------------------------------------------
diff --git a/cpp/src/.clang-tidy-ignore b/cpp/src/.clang-tidy-ignore
deleted file mode 100644
index 5ab4d20..0000000
--- a/cpp/src/.clang-tidy-ignore
+++ /dev/null
@@ -1,2 +0,0 @@
-ipc-adapter-test.cc
-memory-pool-test.cc
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/pyarrow/includes/pyarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd
index a5444c2..dc6ccd2 100644
--- a/python/pyarrow/includes/pyarrow.pxd
+++ b/python/pyarrow/includes/pyarrow.pxd
@@ -18,7 +18,7 @@
# distutils: language = c++
from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn,
+from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CTable,
CDataType, CStatus, Type, MemoryPool)
cimport pyarrow.includes.libarrow_io as arrow_io
@@ -39,6 +39,9 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
CStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr,
PyObject* py_ref, PyObject** out)
+ CStatus ConvertTableToPandas(const shared_ptr[CTable]& table,
+ int nthreads, PyObject** out)
+
MemoryPool* get_memory_pool()
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/pyarrow/table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
index 2f7d430..9375557 100644
--- a/python/pyarrow/table.pyx
+++ b/python/pyarrow/table.pyx
@@ -430,6 +430,32 @@ cdef class RecordBatch:
return result
+cdef table_to_blockmanager(const shared_ptr[CTable]& table, int nthreads):
+ cdef:
+ PyObject* result_obj
+ CColumn* col
+ int i
+
+ from pandas.core.internals import BlockManager, make_block
+ from pandas import RangeIndex
+
+ check_status(pyarrow.ConvertTableToPandas(table, nthreads, &result_obj))
+
+ result = PyObject_to_object(result_obj)
+
+ blocks = []
+ for block_arr, placement_arr in result:
+ blocks.append(make_block(block_arr, placement=placement_arr))
+
+ names = []
+ for i in range(table.get().num_columns()):
+ col = table.get().column(i).get()
+ names.append(frombytes(col.name()))
+
+ axes = [names, RangeIndex(table.get().num_rows())]
+ return BlockManager(blocks, axes)
+
+
cdef class Table:
"""
A collection of top-level named, equal length Arrow arrays.
@@ -584,7 +610,7 @@ cdef class Table:
table.init(c_table)
return table
- def to_pandas(self):
+ def to_pandas(self, nthreads=1, block_based=True):
"""
Convert the arrow::Table to a pandas DataFrame
@@ -599,17 +625,21 @@ cdef class Table:
import pandas as pd
- names = []
- data = []
- for i in range(self.table.num_columns()):
- col = self.table.column(i)
- column = self.column(i)
- check_status(pyarrow.ConvertColumnToPandas(
- col, <PyObject*> column, &arr))
- names.append(frombytes(col.get().name()))
- data.append(PyObject_to_object(arr))
-
- return pd.DataFrame(dict(zip(names, data)), columns=names)
+ if block_based:
+ mgr = table_to_blockmanager(self.sp_table, nthreads)
+ return pd.DataFrame(mgr)
+ else:
+ names = []
+ data = []
+ for i in range(self.table.num_columns()):
+ col = self.table.column(i)
+ column = self.column(i)
+ check_status(pyarrow.ConvertColumnToPandas(
+ col, <PyObject*> column, &arr))
+ names.append(frombytes(col.get().name()))
+ data.append(PyObject_to_object(arr))
+
+ return pd.DataFrame(dict(zip(names, data)), columns=names)
@property
def name(self):
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/adapters/builtin.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc
index 2a13944..fb7475f 100644
--- a/python/src/pyarrow/adapters/builtin.cc
+++ b/python/src/pyarrow/adapters/builtin.cc
@@ -44,16 +44,16 @@ static inline bool IsPyInteger(PyObject* obj) {
class ScalarVisitor {
public:
- ScalarVisitor() :
- total_count_(0),
- none_count_(0),
- bool_count_(0),
- int_count_(0),
- date_count_(0),
- timestamp_count_(0),
- float_count_(0),
- binary_count_(0),
- unicode_count_(0) {}
+ ScalarVisitor()
+ : total_count_(0),
+ none_count_(0),
+ bool_count_(0),
+ int_count_(0),
+ date_count_(0),
+ timestamp_count_(0),
+ float_count_(0),
+ binary_count_(0),
+ unicode_count_(0) {}
void Visit(PyObject* obj) {
++total_count_;
@@ -100,9 +100,7 @@ class ScalarVisitor {
}
}
- int64_t total_count() const {
- return total_count_;
- }
+ int64_t total_count() const { return total_count_; }
private:
int64_t total_count_;
@@ -123,17 +121,14 @@ static constexpr int MAX_NESTING_LEVELS = 32;
class SeqVisitor {
public:
- SeqVisitor() :
- max_nesting_level_(0) {
+ SeqVisitor() : max_nesting_level_(0) {
memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int));
}
- Status Visit(PyObject* obj, int level=0) {
+ Status Visit(PyObject* obj, int level = 0) {
Py_ssize_t size = PySequence_Size(obj);
- if (level > max_nesting_level_) {
- max_nesting_level_ = level;
- }
+ if (level > max_nesting_level_) { max_nesting_level_ = level; }
for (int64_t i = 0; i < size; ++i) {
// TODO(wesm): Error checking?
@@ -188,9 +183,7 @@ class SeqVisitor {
int max_observed_level() const {
int result = 0;
for (int i = 0; i < MAX_NESTING_LEVELS; ++i) {
- if (nesting_histogram_[i] > 0) {
- result = i;
- }
+ if (nesting_histogram_[i] > 0) { result = i; }
}
return result;
}
@@ -198,9 +191,7 @@ class SeqVisitor {
int num_nesting_levels() const {
int result = 0;
for (int i = 0; i < MAX_NESTING_LEVELS; ++i) {
- if (nesting_histogram_[i] > 0) {
- ++result;
- }
+ if (nesting_histogram_[i] > 0) { ++result; }
}
return result;
}
@@ -214,8 +205,8 @@ class SeqVisitor {
};
// Non-exhaustive type inference
-static Status InferArrowType(PyObject* obj, int64_t* size,
- std::shared_ptr<DataType>* out_type) {
+static Status InferArrowType(
+ PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) {
*size = PySequence_Size(obj);
if (PyErr_Occurred()) {
// Not a sequence
@@ -224,9 +215,7 @@ static Status InferArrowType(PyObject* obj, int64_t* size,
}
// For 0-length sequences, refuse to guess
- if (*size == 0) {
- *out_type = arrow::null();
- }
+ if (*size == 0) { *out_type = arrow::null(); }
SeqVisitor seq_visitor;
RETURN_NOT_OK(seq_visitor.Visit(obj));
@@ -234,9 +223,7 @@ static Status InferArrowType(PyObject* obj, int64_t* size,
*out_type = seq_visitor.GetType();
- if (*out_type == nullptr) {
- return Status::TypeError("Unable to determine data type");
- }
+ if (*out_type == nullptr) { return Status::TypeError("Unable to determine data type"); }
return Status::OK();
}
@@ -337,7 +324,8 @@ class TimestampConverter : public TypedConverter<arrow::TimestampBuilder> {
if (item.obj() == Py_None) {
typed_builder_->AppendNull();
} else {
- PyDateTime_DateTime* pydatetime = reinterpret_cast<PyDateTime_DateTime*>(item.obj());
+ PyDateTime_DateTime* pydatetime =
+ reinterpret_cast<PyDateTime_DateTime*>(item.obj());
struct tm datetime = {0};
datetime.tm_year = PyDateTime_GET_YEAR(pydatetime) - 1900;
datetime.tm_mon = PyDateTime_GET_MONTH(pydatetime) - 1;
@@ -462,6 +450,7 @@ class ListConverter : public TypedConverter<arrow::ListBuilder> {
}
return Status::OK();
}
+
protected:
std::shared_ptr<SeqConverter> value_converter_;
};
@@ -496,8 +485,8 @@ Status ListConverter::Init(const std::shared_ptr<ArrayBuilder>& builder) {
builder_ = builder;
typed_builder_ = static_cast<arrow::ListBuilder*>(builder.get());
- value_converter_ = GetConverter(static_cast<arrow::ListType*>(
- builder->type().get())->value_type());
+ value_converter_ =
+ GetConverter(static_cast<arrow::ListType*>(builder->type().get())->value_type());
if (value_converter_ == nullptr) {
return Status::NotImplemented("value type not implemented");
}
@@ -521,8 +510,7 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out) {
std::shared_ptr<SeqConverter> converter = GetConverter(type);
if (converter == nullptr) {
std::stringstream ss;
- ss << "No type converter implemented for "
- << type->ToString();
+ ss << "No type converter implemented for " << type->ToString();
return Status::NotImplemented(ss.str());
}
@@ -536,4 +524,4 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out) {
return builder->Finish(out);
}
-} // namespace pyarrow
+} // namespace pyarrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/adapters/builtin.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h
index 2ddfdaa..1ff3694 100644
--- a/python/src/pyarrow/adapters/builtin.h
+++ b/python/src/pyarrow/adapters/builtin.h
@@ -40,6 +40,6 @@ namespace pyarrow {
PYARROW_EXPORT
arrow::Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out);
-} // namespace pyarrow
+} // namespace pyarrow
-#endif // PYARROW_ADAPTERS_BUILTIN_H
+#endif // PYARROW_ADAPTERS_BUILTIN_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index 38f3b6f..899eb55 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -28,10 +28,13 @@
#include <memory>
#include <sstream>
#include <string>
+#include <unordered_map>
#include "arrow/api.h"
-#include "arrow/util/bit-util.h"
#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/macros.h"
#include "pyarrow/common.h"
#include "pyarrow/config.h"
@@ -40,10 +43,13 @@
namespace pyarrow {
using arrow::Array;
+using arrow::ChunkedArray;
using arrow::Column;
using arrow::Field;
using arrow::DataType;
using arrow::Status;
+using arrow::Table;
+using arrow::Type;
namespace BitUtil = arrow::BitUtil;
@@ -51,8 +57,7 @@ namespace BitUtil = arrow::BitUtil;
// Serialization
template <int TYPE>
-struct npy_traits {
-};
+struct npy_traits {};
template <>
struct npy_traits<NPY_BOOL> {
@@ -60,21 +65,17 @@ struct npy_traits<NPY_BOOL> {
using TypeClass = arrow::BooleanType;
static constexpr bool supports_nulls = false;
- static inline bool isnull(uint8_t v) {
- return false;
- }
+ static inline bool isnull(uint8_t v) { return false; }
};
-#define NPY_INT_DECL(TYPE, CapType, T) \
- template <> \
- struct npy_traits<NPY_##TYPE> { \
- typedef T value_type; \
- using TypeClass = arrow::CapType##Type; \
- \
- static constexpr bool supports_nulls = false; \
- static inline bool isnull(T v) { \
- return false; \
- } \
+#define NPY_INT_DECL(TYPE, CapType, T) \
+ template <> \
+ struct npy_traits<NPY_##TYPE> { \
+ typedef T value_type; \
+ using TypeClass = arrow::CapType##Type; \
+ \
+ static constexpr bool supports_nulls = false; \
+ static inline bool isnull(T v) { return false; } \
};
NPY_INT_DECL(INT8, Int8, int8_t);
@@ -93,9 +94,7 @@ struct npy_traits<NPY_FLOAT32> {
static constexpr bool supports_nulls = true;
- static inline bool isnull(float v) {
- return v != v;
- }
+ static inline bool isnull(float v) { return v != v; }
};
template <>
@@ -105,9 +104,7 @@ struct npy_traits<NPY_FLOAT64> {
static constexpr bool supports_nulls = true;
- static inline bool isnull(double v) {
- return v != v;
- }
+ static inline bool isnull(double v) { return v != v; }
};
template <>
@@ -135,18 +132,14 @@ struct npy_traits<NPY_OBJECT> {
template <int TYPE>
class ArrowSerializer {
public:
- ArrowSerializer(arrow::MemoryPool* pool, PyArrayObject* arr, PyArrayObject* mask) :
- pool_(pool),
- arr_(arr),
- mask_(mask) {
+ ArrowSerializer(arrow::MemoryPool* pool, PyArrayObject* arr, PyArrayObject* mask)
+ : pool_(pool), arr_(arr), mask_(mask) {
length_ = PyArray_SIZE(arr_);
}
Status Convert(std::shared_ptr<Array>* out);
- int stride() const {
- return PyArray_STRIDES(arr_)[0];
- }
+ int stride() const { return PyArray_STRIDES(arr_)[0]; }
Status InitNullBitmap() {
int null_bytes = BitUtil::BytesForBits(length_);
@@ -215,9 +208,7 @@ class ArrowSerializer {
const int32_t length = PyBytes_GET_SIZE(obj);
s = string_builder.Append(PyBytes_AS_STRING(obj), length);
Py_DECREF(obj);
- if (!s.ok()) {
- return s;
- }
+ if (!s.ok()) { return s; }
} else if (PyBytes_Check(obj)) {
have_bytes = true;
const int32_t length = PyBytes_GET_SIZE(obj);
@@ -230,8 +221,8 @@ class ArrowSerializer {
if (have_bytes) {
const auto& arr = static_cast<const arrow::StringArray&>(*out->get());
- *out = std::make_shared<arrow::BinaryArray>(arr.length(), arr.offsets(),
- arr.data(), arr.null_count(), arr.null_bitmap());
+ *out = std::make_shared<arrow::BinaryArray>(
+ arr.length(), arr.offsets(), arr.data(), arr.null_count(), arr.null_bitmap());
}
return Status::OK();
}
@@ -259,8 +250,7 @@ class ArrowSerializer {
}
}
- *out = std::make_shared<arrow::BooleanArray>(length_, data, null_count,
- null_bitmap_);
+ *out = std::make_shared<arrow::BooleanArray>(length_, data, null_count, null_bitmap_);
return Status::OK();
}
@@ -321,26 +311,27 @@ inline Status ArrowSerializer<TYPE>::MakeDataType(std::shared_ptr<DataType>* out
}
template <>
-inline Status ArrowSerializer<NPY_DATETIME>::MakeDataType(std::shared_ptr<DataType>* out) {
+inline Status ArrowSerializer<NPY_DATETIME>::MakeDataType(
+ std::shared_ptr<DataType>* out) {
PyArray_Descr* descr = PyArray_DESCR(arr_);
auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
arrow::TimestampType::Unit unit;
switch (date_dtype->meta.base) {
- case NPY_FR_s:
- unit = arrow::TimestampType::Unit::SECOND;
- break;
- case NPY_FR_ms:
- unit = arrow::TimestampType::Unit::MILLI;
- break;
- case NPY_FR_us:
- unit = arrow::TimestampType::Unit::MICRO;
- break;
- case NPY_FR_ns:
- unit = arrow::TimestampType::Unit::NANO;
- break;
- default:
- return Status::Invalid("Unknown NumPy datetime unit");
+ case NPY_FR_s:
+ unit = arrow::TimestampType::Unit::SECOND;
+ break;
+ case NPY_FR_ms:
+ unit = arrow::TimestampType::Unit::MILLI;
+ break;
+ case NPY_FR_us:
+ unit = arrow::TimestampType::Unit::MICRO;
+ break;
+ case NPY_FR_ns:
+ unit = arrow::TimestampType::Unit::NANO;
+ break;
+ default:
+ return Status::Invalid("Unknown NumPy datetime unit");
}
out->reset(new arrow::TimestampType(unit));
@@ -351,9 +342,7 @@ template <int TYPE>
inline Status ArrowSerializer<TYPE>::Convert(std::shared_ptr<Array>* out) {
typedef npy_traits<TYPE> traits;
- if (mask_ != nullptr || traits::supports_nulls) {
- RETURN_NOT_OK(InitNullBitmap());
- }
+ if (mask_ != nullptr || traits::supports_nulls) { RETURN_NOT_OK(InitNullBitmap()); }
int64_t null_count = 0;
if (mask_ != nullptr) {
@@ -429,9 +418,7 @@ inline Status ArrowSerializer<NPY_OBJECT>::Convert(std::shared_ptr<Array>* out)
template <int TYPE>
inline Status ArrowSerializer<TYPE>::ConvertData() {
// TODO(wesm): strided arrays
- if (is_strided()) {
- return Status::Invalid("no support for strided data yet");
- }
+ if (is_strided()) { return Status::Invalid("no support for strided data yet"); }
data_ = std::make_shared<NumPyBuffer>(arr_);
return Status::OK();
@@ -439,9 +426,7 @@ inline Status ArrowSerializer<TYPE>::ConvertData() {
template <>
inline Status ArrowSerializer<NPY_BOOL>::ConvertData() {
- if (is_strided()) {
- return Status::Invalid("no support for strided data yet");
- }
+ if (is_strided()) { return Status::Invalid("no support for strided data yet"); }
int nbytes = BitUtil::BytesForBits(length_);
auto buffer = std::make_shared<arrow::PoolBuffer>(pool_);
@@ -453,9 +438,7 @@ inline Status ArrowSerializer<NPY_BOOL>::ConvertData() {
memset(bitmap, 0, nbytes);
for (int i = 0; i < length_; ++i) {
- if (values[i] > 0) {
- BitUtil::SetBit(bitmap, i);
- }
+ if (values[i] > 0) { BitUtil::SetBit(bitmap, i); }
}
data_ = buffer;
@@ -468,29 +451,24 @@ inline Status ArrowSerializer<NPY_OBJECT>::ConvertData() {
return Status::TypeError("NYI");
}
+#define TO_ARROW_CASE(TYPE) \
+ case NPY_##TYPE: { \
+ ArrowSerializer<NPY_##TYPE> converter(pool, arr, mask); \
+ RETURN_NOT_OK(converter.Convert(out)); \
+ } break;
-#define TO_ARROW_CASE(TYPE) \
- case NPY_##TYPE: \
- { \
- ArrowSerializer<NPY_##TYPE> converter(pool, arr, mask); \
- RETURN_NOT_OK(converter.Convert(out)); \
- } \
- break;
-
-Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
- std::shared_ptr<Array>* out) {
+Status PandasMaskedToArrow(
+ arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, std::shared_ptr<Array>* out) {
PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(ao);
PyArrayObject* mask = nullptr;
- if (mo != nullptr) {
- mask = reinterpret_cast<PyArrayObject*>(mo);
- }
+ if (mo != nullptr) { mask = reinterpret_cast<PyArrayObject*>(mo); }
if (PyArray_NDIM(arr) != 1) {
return Status::Invalid("only handle 1-dimensional arrays");
}
- switch(PyArray_DESCR(arr)->type_num) {
+ switch (PyArray_DESCR(arr)->type_num) {
TO_ARROW_CASE(BOOL);
TO_ARROW_CASE(INT8);
TO_ARROW_CASE(INT16);
@@ -506,15 +484,13 @@ Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
TO_ARROW_CASE(OBJECT);
default:
std::stringstream ss;
- ss << "unsupported type " << PyArray_DESCR(arr)->type_num
- << std::endl;
+ ss << "unsupported type " << PyArray_DESCR(arr)->type_num << std::endl;
return Status::NotImplemented(ss.str());
}
return Status::OK();
}
-Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
- std::shared_ptr<Array>* out) {
+Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, std::shared_ptr<Array>* out) {
return PandasMaskedToArrow(pool, ao, nullptr, out);
}
@@ -522,28 +498,27 @@ Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
// Deserialization
template <int TYPE>
-struct arrow_traits {
-};
+struct arrow_traits {};
template <>
struct arrow_traits<arrow::Type::BOOL> {
static constexpr int npy_type = NPY_BOOL;
static constexpr bool supports_nulls = false;
static constexpr bool is_boolean = true;
- static constexpr bool is_pandas_numeric_not_nullable = false;
- static constexpr bool is_pandas_numeric_nullable = false;
+ static constexpr bool is_numeric_not_nullable = false;
+ static constexpr bool is_numeric_nullable = false;
};
-#define INT_DECL(TYPE) \
- template <> \
- struct arrow_traits<arrow::Type::TYPE> { \
- static constexpr int npy_type = NPY_##TYPE; \
- static constexpr bool supports_nulls = false; \
- static constexpr double na_value = NAN; \
- static constexpr bool is_boolean = false; \
- static constexpr bool is_pandas_numeric_not_nullable = true; \
- static constexpr bool is_pandas_numeric_nullable = false; \
- typedef typename npy_traits<NPY_##TYPE>::value_type T; \
+#define INT_DECL(TYPE) \
+ template <> \
+ struct arrow_traits<arrow::Type::TYPE> { \
+ static constexpr int npy_type = NPY_##TYPE; \
+ static constexpr bool supports_nulls = false; \
+ static constexpr double na_value = NAN; \
+ static constexpr bool is_boolean = false; \
+ static constexpr bool is_numeric_not_nullable = true; \
+ static constexpr bool is_numeric_nullable = false; \
+ typedef typename npy_traits<NPY_##TYPE>::value_type T; \
};
INT_DECL(INT8);
@@ -561,8 +536,8 @@ struct arrow_traits<arrow::Type::FLOAT> {
static constexpr bool supports_nulls = true;
static constexpr float na_value = NAN;
static constexpr bool is_boolean = false;
- static constexpr bool is_pandas_numeric_not_nullable = false;
- static constexpr bool is_pandas_numeric_nullable = true;
+ static constexpr bool is_numeric_not_nullable = false;
+ static constexpr bool is_numeric_nullable = true;
typedef typename npy_traits<NPY_FLOAT32>::value_type T;
};
@@ -572,19 +547,21 @@ struct arrow_traits<arrow::Type::DOUBLE> {
static constexpr bool supports_nulls = true;
static constexpr double na_value = NAN;
static constexpr bool is_boolean = false;
- static constexpr bool is_pandas_numeric_not_nullable = false;
- static constexpr bool is_pandas_numeric_nullable = true;
+ static constexpr bool is_numeric_not_nullable = false;
+ static constexpr bool is_numeric_nullable = true;
typedef typename npy_traits<NPY_FLOAT64>::value_type T;
};
+static constexpr int64_t kPandasTimestampNull = std::numeric_limits<int64_t>::min();
+
template <>
struct arrow_traits<arrow::Type::TIMESTAMP> {
static constexpr int npy_type = NPY_DATETIME;
static constexpr bool supports_nulls = true;
- static constexpr int64_t na_value = std::numeric_limits<int64_t>::min();
+ static constexpr int64_t na_value = kPandasTimestampNull;
static constexpr bool is_boolean = false;
- static constexpr bool is_pandas_numeric_not_nullable = false;
- static constexpr bool is_pandas_numeric_nullable = true;
+ static constexpr bool is_numeric_not_nullable = false;
+ static constexpr bool is_numeric_nullable = true;
typedef typename npy_traits<NPY_DATETIME>::value_type T;
};
@@ -592,10 +569,10 @@ template <>
struct arrow_traits<arrow::Type::DATE> {
static constexpr int npy_type = NPY_DATETIME;
static constexpr bool supports_nulls = true;
- static constexpr int64_t na_value = std::numeric_limits<int64_t>::min();
+ static constexpr int64_t na_value = kPandasTimestampNull;
static constexpr bool is_boolean = false;
- static constexpr bool is_pandas_numeric_not_nullable = false;
- static constexpr bool is_pandas_numeric_nullable = true;
+ static constexpr bool is_numeric_not_nullable = false;
+ static constexpr bool is_numeric_nullable = true;
typedef typename npy_traits<NPY_DATETIME>::value_type T;
};
@@ -604,18 +581,39 @@ struct arrow_traits<arrow::Type::STRING> {
static constexpr int npy_type = NPY_OBJECT;
static constexpr bool supports_nulls = true;
static constexpr bool is_boolean = false;
- static constexpr bool is_pandas_numeric_not_nullable = false;
- static constexpr bool is_pandas_numeric_nullable = false;
+ static constexpr bool is_numeric_not_nullable = false;
+ static constexpr bool is_numeric_nullable = false;
+};
+
+// Traits for Arrow BINARY columns: converted to NumPy object arrays (NPY_OBJECT),
+// nulls are supported, and the type is neither boolean nor numeric.
+template <>
+struct arrow_traits<arrow::Type::BINARY> {
+ static constexpr int npy_type = NPY_OBJECT;
+ static constexpr bool supports_nulls = true;
+ static constexpr bool is_boolean = false;
+ static constexpr bool is_numeric_not_nullable = false;
+ static constexpr bool is_numeric_nullable = false;
};
+template <typename T>
+struct WrapBytes {};
-static inline PyObject* make_pystring(const uint8_t* data, int32_t length) {
+template <>
+struct WrapBytes<arrow::StringArray> {
+ static inline PyObject* Wrap(const uint8_t* data, int64_t length) {
#if PY_MAJOR_VERSION >= 3
- return PyUnicode_FromStringAndSize(reinterpret_cast<const char*>(data), length);
+ return PyUnicode_FromStringAndSize(reinterpret_cast<const char*>(data), length);
#else
- return PyString_FromStringAndSize(reinterpret_cast<const char*>(data), length);
+ return PyString_FromStringAndSize(reinterpret_cast<const char*>(data), length);
#endif
-}
+ }
+};
+
+template <>
+struct WrapBytes<arrow::BinaryArray> {
+ static inline PyObject* Wrap(const uint8_t* data, int64_t length) {
+ return PyBytes_FromStringAndSize(reinterpret_cast<const char*>(data), length);
+ }
+};
inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) {
if (type == NPY_DATETIME) {
@@ -645,20 +643,169 @@ inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out)
}
}
-template <int TYPE>
-class ArrowDeserializer {
- public:
- ArrowDeserializer(const std::shared_ptr<Column>& col, PyObject* py_ref) :
- col_(col), py_ref_(py_ref) {}
+template <typename T>
+inline void ConvertIntegerWithNulls(const ChunkedArray& data, double* out_values) {
+ for (int c = 0; c < data.num_chunks(); c++) {
+ const std::shared_ptr<Array> arr = data.chunk(c);
+ auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
+ auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+ // Upcast to double, set NaN as appropriate
- Status Convert(PyObject** out) {
- const std::shared_ptr<arrow::ChunkedArray> data = col_->data();
+ for (int i = 0; i < arr->length(); ++i) {
+ *out_values++ = prim_arr->IsNull(i) ? NAN : in_values[i];
+ }
+ }
+}
- RETURN_NOT_OK(ConvertValues<TYPE>(data));
- *out = reinterpret_cast<PyObject*>(out_);
+// Copies the value buffer of every chunk in `data` into `out_values`, one chunk
+// after another, without any type conversion or null handling. Each chunk's raw
+// buffer is reinterpreted as an array of T and copied with memcpy.
+template <typename T>
+inline void ConvertIntegerNoNullsSameType(const ChunkedArray& data, T* out_values) {
+  T* cursor = out_values;
+  for (int chunk_idx = 0; chunk_idx < data.num_chunks(); ++chunk_idx) {
+    const std::shared_ptr<Array> chunk = data.chunk(chunk_idx);
+    auto primitive = static_cast<arrow::PrimitiveArray*>(chunk.get());
+    const T* source = reinterpret_cast<const T*>(primitive->data()->data());
+
+    // Bulk-copy this chunk, then advance past the values just written.
+    memcpy(cursor, source, sizeof(T) * chunk->length());
+    cursor += chunk->length();
+  }
+}
- return Status::OK();
+// Converts every chunk of `data` from InType to OutType, writing the values
+// contiguously into `out_values` (chunk after chunk). No null handling is done;
+// the values are read straight from each chunk's value buffer.
+//
+// `out_values` must have room for the total number of values across all chunks.
+template <typename InType, typename OutType>
+inline void ConvertIntegerNoNullsCast(const ChunkedArray& data, OutType* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const std::shared_ptr<Array> arr = data.chunk(c);
+    auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
+    auto in_values = reinterpret_cast<const InType*>(prim_arr->data()->data());
+    for (int64_t i = 0; i < arr->length(); ++i) {
+      // Advance the output cursor on every write; without the increment each
+      // value would overwrite the first slot and leave the rest uninitialized.
+      *out_values++ = in_values[i];
+    }
+  }
+}
+
+// Writes one Python object per value of `data` into `out_values`: None for a null
+// slot, otherwise True or False. A new reference is taken for every object stored.
+// The GIL is held for the duration of the conversion. Always returns Status::OK().
+static Status ConvertBooleanWithNulls(const ChunkedArray& data, PyObject** out_values) {
+  PyAcquireGIL lock;
+  for (int chunk_idx = 0; chunk_idx < data.num_chunks(); ++chunk_idx) {
+    const std::shared_ptr<Array> chunk = data.chunk(chunk_idx);
+    auto booleans = static_cast<arrow::BooleanArray*>(chunk.get());
+
+    for (int64_t row = 0; row < chunk->length(); ++row) {
+      // The null check comes first; the value is only read for non-null slots.
+      PyObject* item = Py_None;
+      if (!booleans->IsNull(row)) { item = booleans->Value(row) ? Py_True : Py_False; }
+      Py_INCREF(item);
+      *out_values++ = item;
+    }
+  }
+  return Status::OK();
+}
+
+// Expands the boolean values of `data` into one uint8_t per value (the result of
+// casting each bool to uint8_t), written contiguously into `out_values`. Null
+// slots are not examined.
+static void ConvertBooleanNoNulls(const ChunkedArray& data, uint8_t* out_values) {
+  uint8_t* cursor = out_values;
+  for (int chunk_idx = 0; chunk_idx < data.num_chunks(); ++chunk_idx) {
+    const std::shared_ptr<Array> chunk = data.chunk(chunk_idx);
+    auto booleans = static_cast<arrow::BooleanArray*>(chunk.get());
+    for (int64_t row = 0; row < chunk->length(); ++row) {
+      *cursor = static_cast<uint8_t>(booleans->Value(row));
+      ++cursor;
+    }
+  }
+}
+
+// Converts every value of a string/binary chunked array into a Python object
+// stored in `out_values`: None for nulls, otherwise whatever
+// WrapBytes<ArrayType>::Wrap builds from the value's bytes. The GIL is held for
+// the duration of the conversion.
+//
+// Returns Status::UnknownError as soon as Wrap yields nullptr; the failing slot
+// holds nullptr and later slots are left untouched.
+template <typename ArrayType>
+inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) {
+  PyAcquireGIL lock;
+  for (int chunk_idx = 0; chunk_idx < data.num_chunks(); ++chunk_idx) {
+    auto values = static_cast<ArrayType*>(data.chunk(chunk_idx).get());
+
+    // Per-slot null checks are skipped when the whole column has no nulls.
+    const bool has_nulls = data.null_count() > 0;
+    for (int64_t row = 0; row < values->length(); ++row, ++out_values) {
+      if (has_nulls && values->IsNull(row)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+        continue;
+      }
+      int32_t length;
+      const uint8_t* bytes = values->GetValue(row, &length);
+      *out_values = WrapBytes<ArrayType>::Wrap(bytes, length);
+      if (*out_values == nullptr) {
+        return Status::UnknownError("String initialization failed");
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// Copies the values of `data` into `out_values` chunk by chunk, keeping type T.
+// Chunks that contain nulls are copied element-wise, substituting `na_value`
+// wherever the validity bit is not set; chunks without nulls are copied with a
+// single memcpy.
+template <typename T>
+inline void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* out_values) {
+ for (int c = 0; c < data.num_chunks(); c++) {
+ const std::shared_ptr<Array> arr = data.chunk(c);
+ auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
+ auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+
+ // Only consulted on the null_count() > 0 path below.
+ const uint8_t* valid_bits = arr->null_bitmap_data();
+
+ if (arr->null_count() > 0) {
+ for (int64_t i = 0; i < arr->length(); ++i) {
+ *out_values++ = BitUtil::BitNotSet(valid_bits, i) ? na_value : in_values[i];
+ }
+ } else {
+ // Fast path: no nulls in this chunk, so the buffer can be copied wholesale.
+ memcpy(out_values, in_values, sizeof(T) * arr->length());
+ out_values += arr->length();
+ }
}
+}
+
+// Converts the values of `data` from InType to OutType, writing them contiguously
+// into `out_values`. Null slots are written as `na_value`; all other slots are
+// static_cast to OutType.
+template <typename InType, typename OutType>
+inline void ConvertNumericNullableCast(
+    const ChunkedArray& data, OutType na_value, OutType* out_values) {
+  for (int chunk_idx = 0; chunk_idx < data.num_chunks(); ++chunk_idx) {
+    const std::shared_ptr<Array> chunk = data.chunk(chunk_idx);
+    auto primitive = static_cast<arrow::PrimitiveArray*>(chunk.get());
+    const InType* source = reinterpret_cast<const InType*>(primitive->data()->data());
+
+    for (int64_t row = 0; row < chunk->length(); ++row) {
+      if (chunk->IsNull(row)) {
+        *out_values = na_value;
+      } else {
+        *out_values = static_cast<OutType>(source[row]);
+      }
+      ++out_values;
+    }
+  }
+}
+
+// Converts millisecond values to whole days by dividing each non-null input by
+// 86400000 (1000 * 60 * 60 * 24 milliseconds per day). Null slots are written as
+// `na_value`. Results are written contiguously into `out_values`.
+template <typename T>
+inline void ConvertDates(const ChunkedArray& data, T na_value, T* out_values) {
+  for (int chunk_idx = 0; chunk_idx < data.num_chunks(); ++chunk_idx) {
+    const std::shared_ptr<Array> chunk = data.chunk(chunk_idx);
+    auto primitive = static_cast<arrow::PrimitiveArray*>(chunk.get());
+    const T* millis = reinterpret_cast<const T*>(primitive->data()->data());
+
+    for (int64_t row = 0; row < chunk->length(); ++row) {
+      if (chunk->IsNull(row)) {
+        *out_values = na_value;
+      } else {
+        // Milliseconds -> days
+        *out_values = millis[row] / 86400000;
+      }
+      ++out_values;
+    }
+  }
+}
+
+// Rescales the values of `data` to int64: each non-null input is cast to int64_t
+// and multiplied by SHIFT (despite the name, a multiplier rather than a bit
+// shift). Null slots are written as kPandasTimestampNull.
+template <typename InType, int SHIFT>
+inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) {
+  for (int chunk_idx = 0; chunk_idx < data.num_chunks(); ++chunk_idx) {
+    const std::shared_ptr<Array> chunk = data.chunk(chunk_idx);
+    auto primitive = static_cast<arrow::PrimitiveArray*>(chunk.get());
+    const InType* source = reinterpret_cast<const InType*>(primitive->data()->data());
+
+    for (int64_t row = 0; row < chunk->length(); ++row) {
+      if (chunk->IsNull(row)) {
+        *out_values = kPandasTimestampNull;
+      } else {
+        *out_values = static_cast<int64_t>(source[row]) * SHIFT;
+      }
+      ++out_values;
+    }
+  }
+}
+
+class ArrowDeserializer {
+ public:
+ ArrowDeserializer(const std::shared_ptr<Column>& col, PyObject* py_ref)
+ : col_(col), data_(*col->data().get()), py_ref_(py_ref) {}
Status AllocateOutput(int type) {
PyAcquireGIL lock;
@@ -676,20 +823,29 @@ class ArrowDeserializer {
return Status::OK();
}
- Status OutputFromData(int type, void* data) {
+ template <int TYPE>
+ Status ConvertValuesZeroCopy(int npy_type, std::shared_ptr<Array> arr) {
+ typedef typename arrow_traits<TYPE>::T T;
+
+ auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
+ auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+
+ // Zero-Copy. We can pass the data pointer directly to NumPy.
+ void* data = const_cast<T*>(in_values);
+
PyAcquireGIL lock;
// Zero-Copy. We can pass the data pointer directly to NumPy.
npy_intp dims[1] = {col_->length()};
- out_ = reinterpret_cast<PyArrayObject*>(PyArray_SimpleNewFromData(1, dims,
- type, data));
+ out_ = reinterpret_cast<PyArrayObject*>(
+ PyArray_SimpleNewFromData(1, dims, npy_type, data));
if (out_ == NULL) {
// Error occurred, trust that SimpleNew set the error state
return Status::OK();
}
- set_numpy_metadata(type, col_->type().get(), out_);
+ set_numpy_metadata(npy_type, col_->type().get(), out_);
if (PyArray_SetBaseObject(out_, py_ref_) == -1) {
// Error occurred, trust that SetBaseObject set the error state
@@ -705,317 +861,621 @@ class ArrowDeserializer {
return Status::OK();
}
- template <int T2>
- Status ConvertValuesZeroCopy(std::shared_ptr<Array> arr) {
- typedef typename arrow_traits<T2>::T T;
+ // ----------------------------------------------------------------------
+ // Allocate new array and deserialize. Can do a zero copy conversion for some
+ // types
- auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
- auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+ Status Convert(PyObject** out) {
+#define CONVERT_CASE(TYPE) \
+ case arrow::Type::TYPE: { \
+ RETURN_NOT_OK(ConvertValues<arrow::Type::TYPE>()); \
+ } break;
+
+ switch (col_->type()->type) {
+ CONVERT_CASE(BOOL);
+ CONVERT_CASE(INT8);
+ CONVERT_CASE(INT16);
+ CONVERT_CASE(INT32);
+ CONVERT_CASE(INT64);
+ CONVERT_CASE(UINT8);
+ CONVERT_CASE(UINT16);
+ CONVERT_CASE(UINT32);
+ CONVERT_CASE(UINT64);
+ CONVERT_CASE(FLOAT);
+ CONVERT_CASE(DOUBLE);
+ CONVERT_CASE(BINARY);
+ CONVERT_CASE(STRING);
+ CONVERT_CASE(DATE);
+ CONVERT_CASE(TIMESTAMP);
+ default:
+ return Status::NotImplemented("Arrow type reading not implemented");
+ }
- // Zero-Copy. We can pass the data pointer directly to NumPy.
- void* data = const_cast<T*>(in_values);
- int type = arrow_traits<TYPE>::npy_type;
- RETURN_NOT_OK(OutputFromData(type, data));
+#undef CONVERT_CASE
+ *out = reinterpret_cast<PyObject*>(out_);
return Status::OK();
}
- template <int T2>
+ template <int TYPE>
inline typename std::enable_if<
- (T2 != arrow::Type::DATE) & arrow_traits<T2>::is_pandas_numeric_nullable, Status>::type
- ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
- typedef typename arrow_traits<T2>::T T;
- size_t chunk_offset = 0;
+ (TYPE != arrow::Type::DATE) & arrow_traits<TYPE>::is_numeric_nullable, Status>::type
+ ConvertValues() {
+ typedef typename arrow_traits<TYPE>::T T;
+ int npy_type = arrow_traits<TYPE>::npy_type;
- if (data->num_chunks() == 1 && data->null_count() == 0) {
- return ConvertValuesZeroCopy<TYPE>(data->chunk(0));
+ if (data_.num_chunks() == 1 && data_.null_count() == 0) {
+ return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0));
}
- RETURN_NOT_OK(AllocateOutput(arrow_traits<T2>::npy_type));
+ RETURN_NOT_OK(AllocateOutput(npy_type));
+ auto out_values = reinterpret_cast<T*>(PyArray_DATA(out_));
+ ConvertNumericNullable<T>(data_, arrow_traits<TYPE>::na_value, out_values);
- for (int c = 0; c < data->num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data->chunk(c);
- auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
- auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
- auto out_values = reinterpret_cast<T*>(PyArray_DATA(out_)) + chunk_offset;
+ return Status::OK();
+ }
- if (arr->null_count() > 0) {
- for (int64_t i = 0; i < arr->length(); ++i) {
- out_values[i] = arr->IsNull(i) ? arrow_traits<T2>::na_value : in_values[i];
- }
- } else {
- memcpy(out_values, in_values, sizeof(T) * arr->length());
- }
+  // DATE specialization: always allocates a fresh output array of the traits'
+  // NumPy type and fills it through ConvertDates, using the traits' na_value for
+  // null slots.
+  template <int TYPE>
+  inline typename std::enable_if<TYPE == arrow::Type::DATE, Status>::type
+  ConvertValues() {
+    using Traits = arrow_traits<TYPE>;
+    using T = typename Traits::T;
+
+    RETURN_NOT_OK(AllocateOutput(Traits::npy_type));
+    T* out_values = reinterpret_cast<T*>(PyArray_DATA(out_));
+    ConvertDates<T>(data_, Traits::na_value, out_values);
+    return Status::OK();
+  }
- chunk_offset += arr->length();
+ // Integer specialization
+ //
+ // Three paths:
+ //  * a single chunk without nulls is handed to ConvertValuesZeroCopy;
+ //  * data with nulls is written to a newly allocated float64 array via
+ //    ConvertIntegerWithNulls;
+ //  * otherwise the chunks are copied into a newly allocated array of the
+ //    integer's own NumPy type.
+ template <int TYPE>
+ inline
+ typename std::enable_if<arrow_traits<TYPE>::is_numeric_not_nullable, Status>::type
+ ConvertValues() {
+ typedef typename arrow_traits<TYPE>::T T;
+ int npy_type = arrow_traits<TYPE>::npy_type;
+
+ if (data_.num_chunks() == 1 && data_.null_count() == 0) {
+ return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0));
+ }
+
+ if (data_.null_count() > 0) {
+ // Output dtype changes to float64 on this path.
+ RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64));
+ auto out_values = reinterpret_cast<double*>(PyArray_DATA(out_));
+ ConvertIntegerWithNulls<T>(data_, out_values);
+ } else {
+ RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
+ auto out_values = reinterpret_cast<T*>(PyArray_DATA(out_));
+ ConvertIntegerNoNullsSameType<T>(data_, out_values);
}
return Status::OK();
}
+  // Boolean specialization
+  //
+  // Without nulls the output is an array of the traits' NumPy type, filled with
+  // one uint8_t per value. With nulls the output is an object array holding
+  // None/True/False.
+  template <int TYPE>
+  inline typename std::enable_if<arrow_traits<TYPE>::is_boolean, Status>::type
+  ConvertValues() {
+    if (!(data_.null_count() > 0)) {
+      RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
+      uint8_t* raw_values = reinterpret_cast<uint8_t*>(PyArray_DATA(out_));
+      ConvertBooleanNoNulls(data_, raw_values);
+      return Status::OK();
+    }
+
+    RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+    PyObject** object_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_));
+    RETURN_NOT_OK(ConvertBooleanWithNulls(data_, object_values));
+    return Status::OK();
+  }
+
+  // UTF8 strings
+  //
+  // Allocates an object array and fills it through
+  // ConvertBinaryLike<arrow::StringArray>, whose status is returned.
+  template <int TYPE>
+  inline typename std::enable_if<TYPE == arrow::Type::STRING, Status>::type
+  ConvertValues() {
+    RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+    PyObject** object_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_));
+    const Status status = ConvertBinaryLike<arrow::StringArray>(data_, object_values);
+    return status;
+  }
+
template <int T2>
- inline typename std::enable_if<
- T2 == arrow::Type::DATE, Status>::type
- ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
- typedef typename arrow_traits<T2>::T T;
- size_t chunk_offset = 0;
+ inline typename std::enable_if<T2 == arrow::Type::BINARY, Status>::type
+ ConvertValues() {
+ RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+ auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_));
+ return ConvertBinaryLike<arrow::BinaryArray>(data_, out_values);
+ }
+
+ private:
+ std::shared_ptr<Column> col_;
+ const arrow::ChunkedArray& data_;
+ PyObject* py_ref_;
+ PyArrayObject* out_;
+};
+
+// Converts a single Arrow array by wrapping it in a one-array Column whose field
+// is named "dummy" and delegating to ConvertColumnToPandas.
+Status ConvertArrayToPandas(
+    const std::shared_ptr<Array>& arr, PyObject* py_ref, PyObject** out) {
+  static std::string dummy_name = "dummy";
+  const std::shared_ptr<Field> wrapper_field =
+      std::make_shared<Field>(dummy_name, arr->type());
+  const std::shared_ptr<Column> wrapper_column =
+      std::make_shared<Column>(wrapper_field, arr);
+  return ConvertColumnToPandas(wrapper_column, py_ref, out);
+}
- RETURN_NOT_OK(AllocateOutput(arrow_traits<T2>::npy_type));
+// Converts one Arrow column by constructing an ArrowDeserializer for it and
+// returning the status of its Convert(out) call.
+Status ConvertColumnToPandas(
+    const std::shared_ptr<Column>& col, PyObject* py_ref, PyObject** out) {
+  return ArrowDeserializer(col, py_ref).Convert(out);
+}
- for (int c = 0; c < data->num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data->chunk(c);
- auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
- auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
- auto out_values = reinterpret_cast<T*>(PyArray_DATA(out_)) + chunk_offset;
+// ----------------------------------------------------------------------
+// pandas 0.x DataFrame conversion internals
- for (int64_t i = 0; i < arr->length(); ++i) {
- // There are 1000 * 60 * 60 * 24 = 86400000ms in a day
- out_values[i] = arr->IsNull(i) ? arrow_traits<T2>::na_value : in_values[i] / 86400000;
- }
+class PandasBlock {
+ public:
+ enum type {
+ OBJECT,
+ UINT8,
+ INT8,
+ UINT16,
+ INT16,
+ UINT32,
+ INT32,
+ UINT64,
+ INT64,
+ FLOAT,
+ DOUBLE,
+ BOOL,
+ DATETIME,
+ CATEGORICAL
+ };
+
+ PandasBlock(int64_t num_rows, int num_columns)
+ : num_rows_(num_rows), num_columns_(num_columns) {}
+
+ virtual Status Allocate() = 0;
+ virtual Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) = 0;
- chunk_offset += arr->length();
+ PyObject* block_arr() { return block_arr_.obj(); }
+
+ PyObject* placement_arr() { return placement_arr_.obj(); }
+
+ protected:
+ Status AllocateNDArray(int npy_type) {
+ PyAcquireGIL lock;
+
+ npy_intp block_dims[2] = {num_columns_, num_rows_};
+ PyObject* block_arr = PyArray_SimpleNew(2, block_dims, npy_type);
+ if (block_arr == NULL) {
+ // TODO(wesm): propagating Python exception
+ return Status::OK();
}
+ npy_intp placement_dims[1] = {num_columns_};
+ PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64);
+ if (placement_arr == NULL) {
+ // TODO(wesm): propagating Python exception
+ return Status::OK();
+ }
+
+ block_arr_.reset(block_arr);
+ placement_arr_.reset(placement_arr);
+ current_placement_index_ = 0;
+
+ block_data_ = reinterpret_cast<uint8_t*>(
+ PyArray_DATA(reinterpret_cast<PyArrayObject*>(block_arr)));
+
+ placement_data_ = reinterpret_cast<int64_t*>(
+ PyArray_DATA(reinterpret_cast<PyArrayObject*>(placement_arr)));
+
return Status::OK();
}
- // Integer specialization
- template <int T2>
- inline typename std::enable_if<
- arrow_traits<T2>::is_pandas_numeric_not_nullable, Status>::type
- ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
- typedef typename arrow_traits<T2>::T T;
- size_t chunk_offset = 0;
+ int64_t num_rows_;
+ int num_columns_;
+ int current_placement_index_;
- if (data->num_chunks() == 1 && data->null_count() == 0) {
- return ConvertValuesZeroCopy<TYPE>(data->chunk(0));
- }
+ OwnedRef block_arr_;
+ uint8_t* block_data_;
- if (data->null_count() > 0) {
- RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64));
+ // ndarray<int64>
+ OwnedRef placement_arr_;
+ int64_t* placement_data_;
- for (int c = 0; c < data->num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data->chunk(c);
- auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
- auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
- // Upcast to double, set NaN as appropriate
- auto out_values = reinterpret_cast<double*>(PyArray_DATA(out_)) + chunk_offset;
+ DISALLOW_COPY_AND_ASSIGN(PandasBlock);
+};
- for (int i = 0; i < arr->length(); ++i) {
- out_values[i] = prim_arr->IsNull(i) ? NAN : in_values[i];
- }
+class ObjectBlock : public PandasBlock {
+ public:
+ using PandasBlock::PandasBlock;
- chunk_offset += arr->length();
- }
- } else {
- RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
+ Status Allocate() override { return AllocateNDArray(NPY_OBJECT); }
- for (int c = 0; c < data->num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data->chunk(c);
- auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
- auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
- auto out_values = reinterpret_cast<T*>(PyArray_DATA(out_)) + chunk_offset;
+ Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) override {
+ Type::type type = col->type()->type;
- memcpy(out_values, in_values, sizeof(T) * arr->length());
+ PyObject** out_buffer =
+ reinterpret_cast<PyObject**>(block_data_) + current_placement_index_ * num_rows_;
- chunk_offset += arr->length();
- }
+ const ChunkedArray& data = *col->data().get();
+
+ if (type == Type::BOOL) {
+ RETURN_NOT_OK(ConvertBooleanWithNulls(data, out_buffer));
+ } else if (type == Type::BINARY) {
+ RETURN_NOT_OK(ConvertBinaryLike<arrow::BinaryArray>(data, out_buffer));
+ } else if (type == Type::STRING) {
+ RETURN_NOT_OK(ConvertBinaryLike<arrow::StringArray>(data, out_buffer));
+ } else {
+ std::stringstream ss;
+ ss << "Unsupported type for object array output: " << col->type()->ToString();
+ return Status::NotImplemented(ss.str());
}
+ placement_data_[current_placement_index_++] = placement;
return Status::OK();
}
+};
- // Boolean specialization
- template <int T2>
- inline typename std::enable_if<
- arrow_traits<T2>::is_boolean, Status>::type
- ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
- size_t chunk_offset = 0;
- PyAcquireGIL lock;
+template <int ARROW_TYPE, typename C_TYPE>
+class IntBlock : public PandasBlock {
+ public:
+ using PandasBlock::PandasBlock;
- if (data->null_count() > 0) {
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+ Status Allocate() override {
+ return AllocateNDArray(arrow_traits<ARROW_TYPE>::npy_type);
+ }
- for (int c = 0; c < data->num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data->chunk(c);
- auto bool_arr = static_cast<arrow::BooleanArray*>(arr.get());
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_)) + chunk_offset;
-
- for (int64_t i = 0; i < arr->length(); ++i) {
- if (bool_arr->IsNull(i)) {
- Py_INCREF(Py_None);
- out_values[i] = Py_None;
- } else if (bool_arr->Value(i)) {
- // True
- Py_INCREF(Py_True);
- out_values[i] = Py_True;
- } else {
- // False
- Py_INCREF(Py_False);
- out_values[i] = Py_False;
- }
- }
+ Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) override {
+ Type::type type = col->type()->type;
- chunk_offset += bool_arr->length();
- }
- } else {
- RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
+ C_TYPE* out_buffer =
+ reinterpret_cast<C_TYPE*>(block_data_) + current_placement_index_ * num_rows_;
- for (int c = 0; c < data->num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data->chunk(c);
- auto bool_arr = static_cast<arrow::BooleanArray*>(arr.get());
- auto out_values = reinterpret_cast<uint8_t*>(PyArray_DATA(out_)) + chunk_offset;
+ const ChunkedArray& data = *col->data().get();
- for (int64_t i = 0; i < arr->length(); ++i) {
- out_values[i] = static_cast<uint8_t>(bool_arr->Value(i));
- }
+ if (type != ARROW_TYPE) { return Status::NotImplemented(col->type()->ToString()); }
- chunk_offset += bool_arr->length();
- }
+ ConvertIntegerNoNullsSameType<C_TYPE>(data, out_buffer);
+ placement_data_[current_placement_index_++] = placement;
+ return Status::OK();
+ }
+};
+
+// Concrete integer blocks: one IntBlock instantiation per Arrow integer type,
+// pairing the Arrow type id with the matching fixed-width C type.
+using UInt8Block = IntBlock<Type::UINT8, uint8_t>;
+using Int8Block = IntBlock<Type::INT8, int8_t>;
+using UInt16Block = IntBlock<Type::UINT16, uint16_t>;
+using Int16Block = IntBlock<Type::INT16, int16_t>;
+using UInt32Block = IntBlock<Type::UINT32, uint32_t>;
+using Int32Block = IntBlock<Type::INT32, int32_t>;
+using UInt64Block = IntBlock<Type::UINT64, uint64_t>;
+using Int64Block = IntBlock<Type::INT64, int64_t>;
+
+// Block backed by an NPY_FLOAT32 ndarray. Only Arrow FLOAT columns can be written
+// to it; any other column type yields Status::NotImplemented carrying the type's
+// string form. Nulls are written as NAN.
+class Float32Block : public PandasBlock {
+ public:
+  using PandasBlock::PandasBlock;
+
+  Status Allocate() override { return AllocateNDArray(NPY_FLOAT32); }
+
+  // Writes `col` into the next free row of the block and records `placement`
+  // for that row.
+  Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) override {
+    if (col->type()->type != Type::FLOAT) {
+      return Status::NotImplemented(col->type()->ToString());
+    }
+
+    // Each written column occupies num_rows_ consecutive floats.
+    float* row_start = reinterpret_cast<float*>(block_data_);
+    row_start += current_placement_index_ * num_rows_;
+
+    const ChunkedArray& values = *col->data().get();
+    ConvertNumericNullable<float>(values, NAN, row_start);
+
+    placement_data_[current_placement_index_] = placement;
+    ++current_placement_index_;
+    return Status::OK();
+  }
+};
+
+// Block backed by an NPY_FLOAT64 ndarray. Accepts DOUBLE columns directly, FLOAT
+// columns (cast to double), and all eight integer types (upcast to double via
+// ConvertIntegerWithNulls). Any other column type yields Status::NotImplemented
+// carrying the type's string form.
+class Float64Block : public PandasBlock {
+ public:
+ using PandasBlock::PandasBlock;
+
+ Status Allocate() override { return AllocateNDArray(NPY_FLOAT64); }
+
+ // Writes `col` into the next free row of the block and records `placement`
+ // for that row. The placement is only recorded when the type is supported.
+ Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) override {
+ Type::type type = col->type()->type;
+
+ // Each written column occupies num_rows_ consecutive doubles.
+ double* out_buffer =
+ reinterpret_cast<double*>(block_data_) + current_placement_index_ * num_rows_;
+
+ const ChunkedArray& data = *col->data().get();
+
+// Expands to the conversion call followed by `break`, so each `case` label below
+// needs no explicit break of its own.
+#define INTEGER_CASE(IN_TYPE) \
+ ConvertIntegerWithNulls<IN_TYPE>(data, out_buffer); \
+ break;
+
+ switch (type) {
+ case Type::UINT8:
+ INTEGER_CASE(uint8_t);
+ case Type::INT8:
+ INTEGER_CASE(int8_t);
+ case Type::UINT16:
+ INTEGER_CASE(uint16_t);
+ case Type::INT16:
+ INTEGER_CASE(int16_t);
+ case Type::UINT32:
+ INTEGER_CASE(uint32_t);
+ case Type::INT32:
+ INTEGER_CASE(int32_t);
+ case Type::UINT64:
+ INTEGER_CASE(uint64_t);
+ case Type::INT64:
+ INTEGER_CASE(int64_t);
+ case Type::FLOAT:
+ ConvertNumericNullableCast<float, double>(data, NAN, out_buffer);
+ break;
+ case Type::DOUBLE:
+ ConvertNumericNullable<double>(data, NAN, out_buffer);
+ break;
+ default:
+ return Status::NotImplemented(col->type()->ToString());
}
+#undef INTEGER_CASE
+
+ placement_data_[current_placement_index_++] = placement;
return Status::OK();
}
+};
- // UTF8 strings
- template <int T2>
- inline typename std::enable_if<
- T2 == arrow::Type::STRING, Status>::type
- ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
- size_t chunk_offset = 0;
- PyAcquireGIL lock;
+class BoolBlock : public PandasBlock {
+ public:
+ using PandasBlock::PandasBlock;
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+ Status Allocate() override { return AllocateNDArray(NPY_BOOL); }
- for (int c = 0; c < data->num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data->chunk(c);
- auto string_arr = static_cast<arrow::StringArray*>(arr.get());
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_)) + chunk_offset;
-
- const uint8_t* data_ptr;
- int32_t length;
- if (data->null_count() > 0) {
- for (int64_t i = 0; i < arr->length(); ++i) {
- if (string_arr->IsNull(i)) {
- Py_INCREF(Py_None);
- out_values[i] = Py_None;
- } else {
- data_ptr = string_arr->GetValue(i, &length);
-
- out_values[i] = make_pystring(data_ptr, length);
- if (out_values[i] == nullptr) {
- return Status::UnknownError("String initialization failed");
- }
- }
- }
+ Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) override {
+ Type::type type = col->type()->type;
+
+ if (type != Type::BOOL) { return Status::NotImplemented(col->type()->ToString()); }
+
+ uint8_t* out_buffer =
+ reinterpret_cast<uint8_t*>(block_data_) + current_placement_index_ * num_rows_;
+
+ ConvertBooleanNoNulls(*col->data().get(), out_buffer);
+ placement_data_[current_placement_index_++] = placement;
+ return Status::OK();
+ }
+};
+
+class DatetimeBlock : public PandasBlock {
+ public:
+ using PandasBlock::PandasBlock;
+
+ Status Allocate() override {
+ RETURN_NOT_OK(AllocateNDArray(NPY_DATETIME));
+
+ PyAcquireGIL lock;
+ auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+ PyArray_DESCR(reinterpret_cast<PyArrayObject*>(block_arr_.obj()))->c_metadata);
+ date_dtype->meta.base = NPY_FR_ns;
+ return Status::OK();
+ }
+
+ Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) override {
+ Type::type type = col->type()->type;
+
+ int64_t* out_buffer =
+ reinterpret_cast<int64_t*>(block_data_) + current_placement_index_ * num_rows_;
+
+ const ChunkedArray& data = *col.get()->data();
+
+ if (type == Type::DATE) {
+ // DateType is millisecond timestamp stored as int64_t
+ // TODO(wesm): Do we want to make sure to zero out the milliseconds?
+ ConvertDatetimeNanos<int64_t, 1000000L>(data, out_buffer);
+ } else if (type == Type::TIMESTAMP) {
+ auto ts_type = static_cast<arrow::TimestampType*>(col->type().get());
+
+ if (ts_type->unit == arrow::TimeUnit::NANO) {
+ ConvertNumericNullable<int64_t>(data, kPandasTimestampNull, out_buffer);
+ } else if (ts_type->unit == arrow::TimeUnit::MICRO) {
+ ConvertDatetimeNanos<int64_t, 1000L>(data, out_buffer);
+ } else if (ts_type->unit == arrow::TimeUnit::MILLI) {
+ ConvertDatetimeNanos<int64_t, 1000000L>(data, out_buffer);
+ } else if (ts_type->unit == arrow::TimeUnit::SECOND) {
+ ConvertDatetimeNanos<int64_t, 1000000000L>(data, out_buffer);
} else {
- for (int64_t i = 0; i < arr->length(); ++i) {
- data_ptr = string_arr->GetValue(i, &length);
- out_values[i] = make_pystring(data_ptr, length);
- if (out_values[i] == nullptr) {
- return Status::UnknownError("String initialization failed");
- }
- }
+ return Status::NotImplemented("Unsupported time unit");
}
-
- chunk_offset += string_arr->length();
+ } else {
+ return Status::NotImplemented(col->type()->ToString());
}
+ placement_data_[current_placement_index_++] = placement;
return Status::OK();
}
+};
- template <int T2>
- inline typename std::enable_if<
- T2 == arrow::Type::BINARY, Status>::type
- ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
- size_t chunk_offset = 0;
- PyAcquireGIL lock;
+// class CategoricalBlock : public PandasBlock {};
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns,
+ std::shared_ptr<PandasBlock>* block) {
+#define BLOCK_CASE(NAME, TYPE) \
+ case PandasBlock::NAME: \
+ *block = std::make_shared<TYPE>(num_rows, num_columns); \
+ break;
- for (int c = 0; c < data->num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data->chunk(c);
- auto binary_arr = static_cast<arrow::BinaryArray*>(arr.get());
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_)) + chunk_offset;
-
- const uint8_t* data_ptr;
- int32_t length;
- if (data->null_count() > 0) {
- for (int64_t i = 0; i < arr->length(); ++i) {
- if (binary_arr->IsNull(i)) {
- Py_INCREF(Py_None);
- out_values[i] = Py_None;
- } else {
- data_ptr = binary_arr->GetValue(i, &length);
-
- out_values[i] = PyBytes_FromStringAndSize(
- reinterpret_cast<const char*>(data_ptr), length);
- if (out_values[i] == nullptr) {
- return Status::UnknownError("String initialization failed");
- }
- }
- }
+ switch (type) {
+ BLOCK_CASE(OBJECT, ObjectBlock);
+ BLOCK_CASE(UINT8, UInt8Block);
+ BLOCK_CASE(INT8, Int8Block);
+ BLOCK_CASE(UINT16, UInt16Block);
+ BLOCK_CASE(INT16, Int16Block);
+ BLOCK_CASE(UINT32, UInt32Block);
+ BLOCK_CASE(INT32, Int32Block);
+ BLOCK_CASE(UINT64, UInt64Block);
+ BLOCK_CASE(INT64, Int64Block);
+ BLOCK_CASE(FLOAT, Float32Block);
+ BLOCK_CASE(DOUBLE, Float64Block);
+ BLOCK_CASE(BOOL, BoolBlock);
+ BLOCK_CASE(DATETIME, DatetimeBlock);
+ case PandasBlock::CATEGORICAL:
+ return Status::NotImplemented("categorical");
+ }
+
+#undef BLOCK_CASE
+
+ return (*block)->Allocate();
+}
+
+// Construct the exact pandas 0.x "BlockManager" memory layout
+//
+// * For each column determine the correct output pandas type
+// * Allocate 2D blocks (ncols x nrows) for each distinct data type in output
+// * Allocate block placement arrays
+// * Write Arrow columns out into each slice of memory, populating the block
+//   placement arrays as we go
+class DataFrameBlockCreator {
+ public:
+ DataFrameBlockCreator(const std::shared_ptr<Table>& table) : table_(table) {}
+
+ Status Convert(int nthreads, PyObject** output) {
+ column_types_.resize(table_->num_columns());
+ type_counts_.clear();
+ blocks_.clear();
+
+ RETURN_NOT_OK(CountColumnTypes());
+ RETURN_NOT_OK(CreateBlocks());
+ RETURN_NOT_OK(WriteTableToBlocks(nthreads));
+
+ return GetResultList(output);
+ }
+
+ Status CountColumnTypes() {
+ for (int i = 0; i < table_->num_columns(); ++i) {
+ std::shared_ptr<Column> col = table_->column(i);
+ PandasBlock::type output_type;
+
+ switch (col->type()->type) {
+ case Type::BOOL:
+ output_type = col->null_count() > 0 ? PandasBlock::OBJECT : PandasBlock::BOOL;
+ break;
+ case Type::UINT8:
+ output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT8;
+ break;
+ case Type::INT8:
+ output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT8;
+ break;
+ case Type::UINT16:
+ output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT16;
+ break;
+ case Type::INT16:
+ output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT16;
+ break;
+ case Type::UINT32:
+ output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT32;
+ break;
+ case Type::INT32:
+ output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT32;
+ break;
+ case Type::INT64:
+ output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT64;
+ break;
+ case Type::UINT64:
+ output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT64;
+ break;
+ case Type::FLOAT:
+ output_type = PandasBlock::FLOAT;
+ break;
+ case Type::DOUBLE:
+ output_type = PandasBlock::DOUBLE;
+ break;
+ case Type::STRING:
+ case Type::BINARY:
+ output_type = PandasBlock::OBJECT;
+ break;
+ case Type::DATE:
+ output_type = PandasBlock::DATETIME;
+ break;
+ case Type::TIMESTAMP:
+ output_type = PandasBlock::DATETIME;
+ break;
+ default:
+ return Status::NotImplemented(col->type()->ToString());
+ }
+
+ auto it = type_counts_.find(output_type);
+ if (it != type_counts_.end()) {
+ // Increment count
+ it->second += 1;
} else {
- for (int64_t i = 0; i < arr->length(); ++i) {
- data_ptr = binary_arr->GetValue(i, &length);
- out_values[i] = PyBytes_FromStringAndSize(
- reinterpret_cast<const char*>(data_ptr), length);
- if (out_values[i] == nullptr) {
- return Status::UnknownError("String initialization failed");
- }
- }
+ // Add key to map
+ type_counts_[output_type] = 1;
}
- chunk_offset += binary_arr->length();
+ column_types_[i] = output_type;
}
+ return Status::OK();
+ }
+ Status CreateBlocks() {
+ for (const auto& it : type_counts_) {
+ PandasBlock::type type = static_cast<PandasBlock::type>(it.first);
+ std::shared_ptr<PandasBlock> block;
+ RETURN_NOT_OK(MakeBlock(type, table_->num_rows(), it.second, &block));
+ blocks_[type] = block;
+ }
return Status::OK();
}
- private:
- std::shared_ptr<Column> col_;
- PyObject* py_ref_;
- PyArrayObject* out_;
-};
+ Status WriteTableToBlocks(int nthreads) {
+ if (nthreads > 1) {
+ return Status::NotImplemented("multithreading not yet implemented");
+ }
-#define FROM_ARROW_CASE(TYPE) \
- case arrow::Type::TYPE: \
- { \
- ArrowDeserializer<arrow::Type::TYPE> converter(col, py_ref); \
- return converter.Convert(out); \
- } \
- break;
+ for (int i = 0; i < table_->num_columns(); ++i) {
+ std::shared_ptr<Column> col = table_->column(i);
+ PandasBlock::type output_type = column_types_[i];
-Status ConvertArrayToPandas(const std::shared_ptr<Array>& arr, PyObject* py_ref,
- PyObject** out) {
- static std::string dummy_name = "dummy";
- auto field = std::make_shared<Field>(dummy_name, arr->type());
- auto col = std::make_shared<Column>(field, arr);
- return ConvertColumnToPandas(col, py_ref, out);
-}
+ auto it = blocks_.find(output_type);
+ if (it == blocks_.end()) { return Status::KeyError("No block allocated"); }
+ RETURN_NOT_OK(it->second->WriteNext(col, i));
+ }
+ return Status::OK();
+ }
-Status ConvertColumnToPandas(const std::shared_ptr<Column>& col, PyObject* py_ref,
- PyObject** out) {
- switch(col->type()->type) {
- FROM_ARROW_CASE(BOOL);
- FROM_ARROW_CASE(INT8);
- FROM_ARROW_CASE(INT16);
- FROM_ARROW_CASE(INT32);
- FROM_ARROW_CASE(INT64);
- FROM_ARROW_CASE(UINT8);
- FROM_ARROW_CASE(UINT16);
- FROM_ARROW_CASE(UINT32);
- FROM_ARROW_CASE(UINT64);
- FROM_ARROW_CASE(FLOAT);
- FROM_ARROW_CASE(DOUBLE);
- FROM_ARROW_CASE(BINARY);
- FROM_ARROW_CASE(STRING);
- FROM_ARROW_CASE(DATE);
- FROM_ARROW_CASE(TIMESTAMP);
- default:
- return Status::NotImplemented("Arrow type reading not implemented");
+ Status GetResultList(PyObject** out) {
+ auto num_blocks = static_cast<Py_ssize_t>(blocks_.size());
+ PyObject* result = PyList_New(num_blocks);
+ RETURN_IF_PYERROR();
+
+ int i = 0;
+ for (const auto& it : blocks_) {
+ const std::shared_ptr<PandasBlock> block = it.second;
+
+ PyObject* item = PyTuple_New(2);
+ RETURN_IF_PYERROR();
+
+ PyObject* block_arr = block->block_arr();
+ PyObject* placement_arr = block->placement_arr();
+ Py_INCREF(block_arr);
+ Py_INCREF(placement_arr);
+ PyTuple_SET_ITEM(item, 0, block_arr);
+ PyTuple_SET_ITEM(item, 1, placement_arr);
+
+ if (PyList_SET_ITEM(result, i++, item) < 0) { RETURN_IF_PYERROR(); }
+ }
+ *out = result;
+ return Status::OK();
}
- return Status::OK();
+
+ private:
+ std::shared_ptr<Table> table_;
+ std::vector<PandasBlock::type> column_types_;
+
+ // block type -> type count
+ std::unordered_map<int, int> type_counts_;
+
+ // block type -> block
+ std::unordered_map<int, std::shared_ptr<PandasBlock>> blocks_;
+};
+
+Status ConvertTableToPandas(
+ const std::shared_ptr<Table>& table, int nthreads, PyObject** out) {
+ DataFrameBlockCreator helper(table);
+ return helper.Convert(nthreads, out);
}
-} // namespace pyarrow
+} // namespace pyarrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/adapters/pandas.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h
index 532495d..60dadd4 100644
--- a/python/src/pyarrow/adapters/pandas.h
+++ b/python/src/pyarrow/adapters/pandas.h
@@ -33,27 +33,42 @@ class Array;
class Column;
class MemoryPool;
class Status;
+class Table;
-} // namespace arrow
+} // namespace arrow
namespace pyarrow {
PYARROW_EXPORT
-arrow::Status ConvertArrayToPandas(const std::shared_ptr<arrow::Array>& arr,
- PyObject* py_ref, PyObject** out);
+arrow::Status ConvertArrayToPandas(
+ const std::shared_ptr<arrow::Array>& arr, PyObject* py_ref, PyObject** out);
PYARROW_EXPORT
-arrow::Status ConvertColumnToPandas(const std::shared_ptr<arrow::Column>& col,
- PyObject* py_ref, PyObject** out);
+arrow::Status ConvertColumnToPandas(
+ const std::shared_ptr<arrow::Column>& col, PyObject* py_ref, PyObject** out);
+
+struct PandasOptions {
+ bool strings_to_categorical;
+};
+
+// Convert a whole table as efficiently as possible to a pandas.DataFrame.
+//
+// The returned Python object is a list of tuples consisting of the exact 2D
+// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x.
+//
+// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2])
+PYARROW_EXPORT
+arrow::Status ConvertTableToPandas(
+ const std::shared_ptr<arrow::Table>& table, int nthreads, PyObject** out);
PYARROW_EXPORT
arrow::Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
std::shared_ptr<arrow::Array>* out);
PYARROW_EXPORT
-arrow::Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
- std::shared_ptr<arrow::Array>* out);
+arrow::Status PandasToArrow(
+ arrow::MemoryPool* pool, PyObject* ao, std::shared_ptr<arrow::Array>* out);
-} // namespace pyarrow
+} // namespace pyarrow
-#endif // PYARROW_ADAPTERS_PANDAS_H
+#endif // PYARROW_ADAPTERS_PANDAS_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/api.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/api.h b/python/src/pyarrow/api.h
index 6dbbc45..f65cc09 100644
--- a/python/src/pyarrow/api.h
+++ b/python/src/pyarrow/api.h
@@ -23,4 +23,4 @@
#include "pyarrow/adapters/builtin.h"
#include "pyarrow/adapters/pandas.h"
-#endif // PYARROW_API_H
+#endif // PYARROW_API_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/common.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc
index fb4d349..8660ac8 100644
--- a/python/src/pyarrow/common.cc
+++ b/python/src/pyarrow/common.cc
@@ -73,7 +73,7 @@ arrow::MemoryPool* get_memory_pool() {
PyBytesBuffer::PyBytesBuffer(PyObject* obj)
: Buffer(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)),
- PyBytes_GET_SIZE(obj)),
+ PyBytes_GET_SIZE(obj)),
obj_(obj) {
Py_INCREF(obj_);
}
@@ -83,4 +83,4 @@ PyBytesBuffer::~PyBytesBuffer() {
Py_DECREF(obj_);
}
-} // namespace pyarrow
+} // namespace pyarrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/common.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h
index 7e33826..639918d 100644
--- a/python/src/pyarrow/common.h
+++ b/python/src/pyarrow/common.h
@@ -24,7 +24,9 @@
#include "arrow/buffer.h"
#include "arrow/util/macros.h"
-namespace arrow { class MemoryPool; }
+namespace arrow {
+class MemoryPool;
+}
namespace pyarrow {
@@ -34,27 +36,18 @@ class OwnedRef {
public:
OwnedRef() : obj_(nullptr) {}
- OwnedRef(PyObject* obj) :
- obj_(obj) {}
+ OwnedRef(PyObject* obj) : obj_(obj) {}
- ~OwnedRef() {
- Py_XDECREF(obj_);
- }
+ ~OwnedRef() { Py_XDECREF(obj_); }
void reset(PyObject* obj) {
- if (obj_ != nullptr) {
- Py_XDECREF(obj_);
- }
+ if (obj_ != nullptr) { Py_XDECREF(obj_); }
obj_ = obj;
}
- void release() {
- obj_ = nullptr;
- }
+ void release() { obj_ = nullptr; }
- PyObject* obj() const{
- return obj_;
- }
+ PyObject* obj() const { return obj_; }
private:
PyObject* obj_;
@@ -78,13 +71,10 @@ struct PyObjectStringify {
class PyGILGuard {
public:
- PyGILGuard() {
- state_ = PyGILState_Ensure();
- }
+ PyGILGuard() { state_ = PyGILState_Ensure(); }
+
+ ~PyGILGuard() { PyGILState_Release(state_); }
- ~PyGILGuard() {
- PyGILState_Release(state_);
- }
private:
PyGILState_STATE state_;
DISALLOW_COPY_AND_ASSIGN(PyGILGuard);
@@ -108,8 +98,7 @@ PYARROW_EXPORT arrow::MemoryPool* get_memory_pool();
class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer {
public:
- NumPyBuffer(PyArrayObject* arr)
- : Buffer(nullptr, 0) {
+ NumPyBuffer(PyArrayObject* arr) : Buffer(nullptr, 0) {
arr_ = arr;
Py_INCREF(arr);
@@ -118,9 +107,7 @@ class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer {
capacity_ = size_;
}
- virtual ~NumPyBuffer() {
- Py_XDECREF(arr_);
- }
+ virtual ~NumPyBuffer() { Py_XDECREF(arr_); }
private:
PyArrayObject* arr_;
@@ -135,22 +122,17 @@ class PYARROW_EXPORT PyBytesBuffer : public arrow::Buffer {
PyObject* obj_;
};
-
class PyAcquireGIL {
public:
- PyAcquireGIL() {
- state_ = PyGILState_Ensure();
- }
+ PyAcquireGIL() { state_ = PyGILState_Ensure(); }
- ~PyAcquireGIL() {
- PyGILState_Release(state_);
- }
+ ~PyAcquireGIL() { PyGILState_Release(state_); }
private:
PyGILState_STATE state_;
DISALLOW_COPY_AND_ASSIGN(PyAcquireGIL);
};
-} // namespace pyarrow
+} // namespace pyarrow
-#endif // PYARROW_COMMON_H
+#endif // PYARROW_COMMON_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/config.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/config.cc b/python/src/pyarrow/config.cc
index 730d2db..e1002bf 100644
--- a/python/src/pyarrow/config.cc
+++ b/python/src/pyarrow/config.cc
@@ -21,8 +21,7 @@
namespace pyarrow {
-void pyarrow_init() {
-}
+void pyarrow_init() {}
PyObject* numpy_nan = nullptr;
@@ -31,4 +30,4 @@ void pyarrow_set_numpy_nan(PyObject* obj) {
numpy_nan = obj;
}
-} // namespace pyarrow
+} // namespace pyarrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/config.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/config.h b/python/src/pyarrow/config.h
index 82936b1..386ee4b 100644
--- a/python/src/pyarrow/config.h
+++ b/python/src/pyarrow/config.h
@@ -24,7 +24,7 @@
#include "pyarrow/visibility.h"
#if PY_MAJOR_VERSION >= 3
- #define PyString_Check PyUnicode_Check
+#define PyString_Check PyUnicode_Check
#endif
namespace pyarrow {
@@ -38,6 +38,6 @@ void pyarrow_init();
PYARROW_EXPORT
void pyarrow_set_numpy_nan(PyObject* obj);
-} // namespace pyarrow
+} // namespace pyarrow
-#endif // PYARROW_CONFIG_H
+#endif // PYARROW_CONFIG_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/helpers.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc
index b42199c..3f65032 100644
--- a/python/src/pyarrow/helpers.cc
+++ b/python/src/pyarrow/helpers.cc
@@ -23,36 +23,35 @@ using namespace arrow;
namespace pyarrow {
-
-#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \
- case Type::NAME: \
- return FACTORY(); \
+#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \
+ case Type::NAME: \
+ return FACTORY(); \
break;
std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
switch (type) {
case Type::NA:
return null();
- GET_PRIMITIVE_TYPE(UINT8, uint8);
- GET_PRIMITIVE_TYPE(INT8, int8);
- GET_PRIMITIVE_TYPE(UINT16, uint16);
- GET_PRIMITIVE_TYPE(INT16, int16);
- GET_PRIMITIVE_TYPE(UINT32, uint32);
- GET_PRIMITIVE_TYPE(INT32, int32);
- GET_PRIMITIVE_TYPE(UINT64, uint64);
- GET_PRIMITIVE_TYPE(INT64, int64);
- GET_PRIMITIVE_TYPE(DATE, date);
+ GET_PRIMITIVE_TYPE(UINT8, uint8);
+ GET_PRIMITIVE_TYPE(INT8, int8);
+ GET_PRIMITIVE_TYPE(UINT16, uint16);
+ GET_PRIMITIVE_TYPE(INT16, int16);
+ GET_PRIMITIVE_TYPE(UINT32, uint32);
+ GET_PRIMITIVE_TYPE(INT32, int32);
+ GET_PRIMITIVE_TYPE(UINT64, uint64);
+ GET_PRIMITIVE_TYPE(INT64, int64);
+ GET_PRIMITIVE_TYPE(DATE, date);
case Type::TIMESTAMP:
return arrow::timestamp(arrow::TimeUnit::MICRO);
break;
- GET_PRIMITIVE_TYPE(BOOL, boolean);
- GET_PRIMITIVE_TYPE(FLOAT, float32);
- GET_PRIMITIVE_TYPE(DOUBLE, float64);
- GET_PRIMITIVE_TYPE(BINARY, binary);
- GET_PRIMITIVE_TYPE(STRING, utf8);
+ GET_PRIMITIVE_TYPE(BOOL, boolean);
+ GET_PRIMITIVE_TYPE(FLOAT, float32);
+ GET_PRIMITIVE_TYPE(DOUBLE, float64);
+ GET_PRIMITIVE_TYPE(BINARY, binary);
+ GET_PRIMITIVE_TYPE(STRING, utf8);
default:
return nullptr;
}
}
-} // namespace pyarrow
+} // namespace pyarrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/helpers.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h
index 8334d97..788c3ee 100644
--- a/python/src/pyarrow/helpers.h
+++ b/python/src/pyarrow/helpers.h
@@ -31,6 +31,6 @@ using arrow::Type;
PYARROW_EXPORT
std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
-} // namespace pyarrow
+} // namespace pyarrow
-#endif // PYARROW_HELPERS_H
+#endif // PYARROW_HELPERS_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/io.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc
index 12f5ba0..ac1aa63 100644
--- a/python/src/pyarrow/io.cc
+++ b/python/src/pyarrow/io.cc
@@ -33,8 +33,7 @@ namespace pyarrow {
// ----------------------------------------------------------------------
// Python file
-PythonFile::PythonFile(PyObject* file)
- : file_(file) {
+PythonFile::PythonFile(PyObject* file) : file_(file) {
Py_INCREF(file_);
}
@@ -81,8 +80,8 @@ Status PythonFile::Read(int64_t nbytes, PyObject** out) {
}
Status PythonFile::Write(const uint8_t* data, int64_t nbytes) {
- PyObject* py_data = PyBytes_FromStringAndSize(
- reinterpret_cast<const char*>(data), nbytes);
+ PyObject* py_data =
+ PyBytes_FromStringAndSize(reinterpret_cast<const char*>(data), nbytes);
ARROW_RETURN_NOT_OK(CheckPyError());
PyObject* result = PyObject_CallMethod(file_, "write", "(O)", py_data);
@@ -102,7 +101,7 @@ Status PythonFile::Tell(int64_t* position) {
// PyLong_AsLongLong can raise OverflowError
ARROW_RETURN_NOT_OK(CheckPyError());
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
@@ -156,7 +155,8 @@ Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr<arrow::Buffer>* out)
Status PyReadableFile::GetSize(int64_t* size) {
PyGILGuard lock;
- int64_t current_position;;
+ int64_t current_position;
+ ;
 ARROW_RETURN_NOT_OK(file_->Tell(&current_position));
ARROW_RETURN_NOT_OK(file_->Seek(0, 2));
@@ -204,7 +204,7 @@ Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) {
PyBytesReader::PyBytesReader(PyObject* obj)
: arrow::io::BufferReader(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)),
- PyBytes_GET_SIZE(obj)),
+ PyBytes_GET_SIZE(obj)),
obj_(obj) {
Py_INCREF(obj_);
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/io.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/io.h b/python/src/pyarrow/io.h
index e14aa8c..fd3e7c0 100644
--- a/python/src/pyarrow/io.h
+++ b/python/src/pyarrow/io.h
@@ -24,7 +24,9 @@
#include "pyarrow/config.h"
#include "pyarrow/visibility.h"
-namespace arrow { class MemoryPool; }
+namespace arrow {
+class MemoryPool;
+}
namespace pyarrow {
@@ -92,6 +94,6 @@ class PYARROW_EXPORT PyBytesReader : public arrow::io::BufferReader {
// TODO(wesm): seekable output files
-} // namespace pyarrow
+} // namespace pyarrow
#endif // PYARROW_IO_H