You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/05/27 16:01:38 UTC
[arrow] branch master updated: ARROW-5027: [Python] Python bindings
for JSON reader
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 207b350 ARROW-5027: [Python] Python bindings for JSON reader
207b350 is described below
commit 207b3507be82e92ebf29ec7d6d3b0bb86091c09a
Author: Philipp Moritz <pc...@gmail.com>
AuthorDate: Mon May 27 18:01:27 2019 +0200
ARROW-5027: [Python] Python bindings for JSON reader
This PR implements Python bindings for the JSON reader.
Author: Philipp Moritz <pc...@gmail.com>
Closes #4044 from pcmoritz/cython-read-json and squashes the following commits:
1148f43ed <Philipp Moritz> update
465e9d416 <Philipp Moritz> fixes and docstring
d75b02d18 <Philipp Moritz> add tests
b630845a7 <Philipp Moritz> temp commit
aa0aa3f85 <Philipp Moritz> update
b1742b00e <Philipp Moritz> update
9dfc978cf <Philipp Moritz> add absolute imports
c776e0f5e <Philipp Moritz> linting
bb614282b <Philipp Moritz> comment in again
46a8561fe <Philipp Moritz> update
619064571 <Philipp Moritz> update
364971c86 <Philipp Moritz> update
90a3510a8 <Philipp Moritz> update
4edb201f6 <Philipp Moritz> initial work on JSON reader python wrapper
---
python/CMakeLists.txt | 2 +-
python/pyarrow/__init__.pxd | 2 +
python/pyarrow/__init__.py | 2 +
python/pyarrow/_csv.pyx | 2 +
python/pyarrow/_cuda.pxd | 2 +
python/pyarrow/_cuda.pyx | 2 +
python/pyarrow/_flight.pyx | 2 +
python/pyarrow/_json.pyx | 194 +++++++++++++++++++++++++++++++
python/pyarrow/_orc.pxd | 2 +
python/pyarrow/_orc.pyx | 2 +
python/pyarrow/_parquet.pxd | 2 +
python/pyarrow/_parquet.pyx | 2 +
python/pyarrow/_plasma.pyx | 2 +
python/pyarrow/benchmark.py | 2 +
python/pyarrow/compat.py | 2 +
python/pyarrow/csv.py | 2 +
python/pyarrow/cuda.py | 2 +
python/pyarrow/feather.py | 2 +
python/pyarrow/filesystem.py | 2 +
python/pyarrow/flight.py | 2 +
python/pyarrow/gandiva.pyx | 2 +
python/pyarrow/hdfs.py | 2 +
python/pyarrow/includes/libarrow.pxd | 31 +++++
python/pyarrow/ipc.py | 2 +
python/pyarrow/{benchmark.py => json.py} | 4 +-
python/pyarrow/jvm.py | 2 +
python/pyarrow/lib.pxd | 2 +
python/pyarrow/lib.pyx | 2 +
python/pyarrow/orc.py | 2 +
python/pyarrow/pandas_compat.py | 2 +
python/pyarrow/parquet.py | 2 +
python/pyarrow/plasma.py | 2 +
python/pyarrow/serialization.py | 2 +
python/pyarrow/tests/test_json.py | 147 +++++++++++++++++++++++
python/pyarrow/types.py | 2 +
python/pyarrow/util.py | 2 +
python/setup.py | 1 +
37 files changed, 438 insertions(+), 3 deletions(-)
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 8f0a4d0..d7f1aba 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -378,7 +378,7 @@ if(UNIX)
set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
endif()
-set(CYTHON_EXTENSIONS lib _csv)
+set(CYTHON_EXTENSIONS lib _csv _json)
set(LINK_LIBS arrow_shared arrow_python_shared)
diff --git a/python/pyarrow/__init__.pxd b/python/pyarrow/__init__.pxd
index 4f43455..95cea5c 100644
--- a/python/pyarrow/__init__.pxd
+++ b/python/pyarrow/__init__.pxd
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import absolute_import
+
from libcpp.memory cimport shared_ptr
from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CDataType,
CField, CRecordBatch, CSchema,
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 17916df..117b1d7 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -17,6 +17,8 @@
# flake8: noqa
+from __future__ import absolute_import
+
import os as _os
import sys as _sys
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index cfed987..0cc424d 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -20,6 +20,8 @@
# cython: embedsignature = True
# cython: language_level = 3
+from __future__ import absolute_import
+
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport (check_status, Field, MemoryPool, ensure_type,
diff --git a/python/pyarrow/_cuda.pxd b/python/pyarrow/_cuda.pxd
index 1180601..fb66413 100644
--- a/python/pyarrow/_cuda.pxd
+++ b/python/pyarrow/_cuda.pxd
@@ -17,6 +17,8 @@
# cython: language_level = 3
+from __future__ import absolute_import
+
from pyarrow.lib cimport *
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
diff --git a/python/pyarrow/_cuda.pyx b/python/pyarrow/_cuda.pyx
index 87be0e6..a9f51b0 100644
--- a/python/pyarrow/_cuda.pyx
+++ b/python/pyarrow/_cuda.pyx
@@ -16,6 +16,8 @@
# under the License.
+from __future__ import absolute_import
+
from pyarrow.compat import tobytes
from pyarrow.lib cimport *
from pyarrow.includes.libarrow_cuda cimport *
diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx
index 806796f..c682635 100644
--- a/python/pyarrow/_flight.pyx
+++ b/python/pyarrow/_flight.pyx
@@ -17,6 +17,8 @@
# cython: language_level = 3
+from __future__ import absolute_import
+
import collections
import enum
diff --git a/python/pyarrow/_json.pyx b/python/pyarrow/_json.pyx
new file mode 100644
index 0000000..b5c839b
--- /dev/null
+++ b/python/pyarrow/_json.pyx
@@ -0,0 +1,194 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from __future__ import absolute_import
+
+from pyarrow.includes.common cimport *
+from pyarrow.includes.libarrow cimport *
+from pyarrow.lib cimport (check_status, Field, MemoryPool, ensure_type,
+ maybe_unbox_memory_pool, get_input_stream,
+ pyarrow_wrap_table, pyarrow_wrap_data_type,
+ pyarrow_unwrap_data_type, pyarrow_wrap_schema,
+ pyarrow_unwrap_schema)
+
+
+cdef class ReadOptions:
+ """
+ Options for reading JSON files.
+
+ Parameters
+ ----------
+ use_threads : bool, optional (default True)
+ Whether to use multiple threads to accelerate reading
+ block_size : int, optional
+ How many bytes to process at a time from the input stream.
+ This will determine multi-threading granularity as well as
+ the size of individual chunks in the Table.
+ """
+ cdef:
+ CJSONReadOptions options
+
+ # Avoid mistakenly creating attributes
+ __slots__ = ()
+
+ def __init__(self, use_threads=None, block_size=None):
+ self.options = CJSONReadOptions.Defaults()
+ if use_threads is not None:
+ self.use_threads = use_threads
+ if block_size is not None:
+ self.block_size = block_size
+
+ @property
+ def use_threads(self):
+ """
+ Whether to use multiple threads to accelerate reading.
+ """
+ return self.options.use_threads
+
+ @use_threads.setter
+ def use_threads(self, value):
+ self.options.use_threads = value
+
+ @property
+ def block_size(self):
+ """
+ How many bytes to process at a time from the input stream.
+ This will determine multi-threading granularity as well as
+ the size of individual chunks in the Table.
+ """
+ return self.options.block_size
+
+ @block_size.setter
+ def block_size(self, value):
+ self.options.block_size = value
+
+cdef class ParseOptions:
+ """
+ Options for parsing JSON files.
+
+ Parameters
+ ----------
+ explicit_schema: Schema, optional (default None)
+ Optional explicit schema (no type inference, ignores other fields).
+ newlines_in_values: bool, optional (default False)
+ Whether objects may be printed across multiple lines (for example
+ pretty printed). If false, input must end with an empty line.
+ """
+
+ cdef:
+ CJSONParseOptions options
+
+ __slots__ = ()
+
+ def __init__(self, explicit_schema=None, newlines_in_values=None):
+ self.options = CJSONParseOptions.Defaults()
+ if explicit_schema is not None:
+ self.explicit_schema = explicit_schema
+ if newlines_in_values is not None:
+ self.newlines_in_values = newlines_in_values
+
+ @property
+ def explicit_schema(self):
+ """
+ Optional explicit schema (no type inference, ignores other fields)
+ """
+ if self.options.explicit_schema.get() == NULL:
+ return None
+ else:
+ return pyarrow_wrap_schema(self.options.explicit_schema)
+
+ @explicit_schema.setter
+ def explicit_schema(self, value):
+ self.options.explicit_schema = pyarrow_unwrap_schema(value)
+
+ @property
+ def newlines_in_values(self):
+ """
+ Whether JSON objects may span multiple lines (e.g. pretty printed).
+ Setting this to True reduces the performance of multi-threaded
+ JSON reading.
+ """
+ return self.options.newlines_in_values
+
+ @newlines_in_values.setter
+ def newlines_in_values(self, value):
+ self.options.newlines_in_values = value
+
+
+cdef _get_reader(input_file, shared_ptr[InputStream]* out):
+ use_memory_map = False
+ get_input_stream(input_file, use_memory_map, out)
+
+cdef _get_read_options(ReadOptions read_options, CJSONReadOptions* out):
+ if read_options is None:
+ out[0] = CJSONReadOptions.Defaults()
+ else:
+ out[0] = read_options.options
+
+cdef _get_parse_options(ParseOptions parse_options, CJSONParseOptions* out):
+ if parse_options is None:
+ out[0] = CJSONParseOptions.Defaults()
+ else:
+ out[0] = parse_options.options
+
+
+def read_json(input_file, read_options=None, parse_options=None,
+ MemoryPool memory_pool=None):
+ """
+ Read a Table from a stream of JSON data.
+
+ Parameters
+ ----------
+ input_file: string, path or file-like object
+ The location of JSON data.
+ read_options: ReadOptions, optional
+ Options for the JSON reader (see ReadOptions constructor for defaults)
+ parse_options: ParseOptions, optional
+ Options for the JSON parser
+ (see ParseOptions constructor for defaults)
+ memory_pool: MemoryPool, optional
+ Pool to allocate Table memory from
+
+ Returns
+ -------
+ :class:`pyarrow.Table`
+ Contents of the JSON file as an in-memory table.
+ """
+ cdef:
+ shared_ptr[InputStream] stream
+ CJSONReadOptions c_read_options
+ CJSONParseOptions c_parse_options
+ shared_ptr[CJSONReader] reader
+ shared_ptr[CTable] table
+
+ _get_reader(input_file, &stream)
+ _get_read_options(read_options, &c_read_options)
+ _get_parse_options(parse_options, &c_parse_options)
+
+ check_status(CJSONReader.Make(maybe_unbox_memory_pool(memory_pool),
+ stream, c_read_options, c_parse_options,
+ &reader))
+
+ with nogil:
+ check_status(reader.get().Read(&table))
+
+ return pyarrow_wrap_table(table)
diff --git a/python/pyarrow/_orc.pxd b/python/pyarrow/_orc.pxd
index 7304937..ebbf8be 100644
--- a/python/pyarrow/_orc.pxd
+++ b/python/pyarrow/_orc.pxd
@@ -18,6 +18,8 @@
# distutils: language = c++
# cython: language_level = 3
+from __future__ import absolute_import
+
from libc.string cimport const_char
from libcpp.vector cimport vector as std_vector
from pyarrow.includes.common cimport *
diff --git a/python/pyarrow/_orc.pyx b/python/pyarrow/_orc.pyx
index 9493f23..c9f5b2e 100644
--- a/python/pyarrow/_orc.pyx
+++ b/python/pyarrow/_orc.pyx
@@ -19,6 +19,8 @@
# distutils: language = c++
# cython: embedsignature = True
+from __future__ import absolute_import
+
from cython.operator cimport dereference as deref
from libcpp.vector cimport vector as std_vector
from pyarrow.includes.common cimport *
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 75c0015..8a6bf73 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -18,6 +18,8 @@
# distutils: language = c++
# cython: language_level = 3
+from __future__ import absolute_import
+
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport (CChunkedArray, CSchema, CStatus,
CTable, CMemoryPool, CBuffer,
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index db7f0c4..a4300cd 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -19,6 +19,8 @@
# distutils: language = c++
# cython: embedsignature = True
+from __future__ import absolute_import
+
import io
import six
import warnings
diff --git a/python/pyarrow/_plasma.pyx b/python/pyarrow/_plasma.pyx
index 5be724d..e352377 100644
--- a/python/pyarrow/_plasma.pyx
+++ b/python/pyarrow/_plasma.pyx
@@ -20,6 +20,8 @@
# cython: embedsignature = True
# cython: language_level = 3
+from __future__ import absolute_import
+
from libcpp cimport bool as c_bool, nullptr
from libcpp.memory cimport shared_ptr, unique_ptr, make_shared
from libcpp.string cimport string as c_string
diff --git a/python/pyarrow/benchmark.py b/python/pyarrow/benchmark.py
index ef1ef53..e8e38a4 100644
--- a/python/pyarrow/benchmark.py
+++ b/python/pyarrow/benchmark.py
@@ -17,4 +17,6 @@
# flake8: noqa
+from __future__ import absolute_import
+
from pyarrow.lib import benchmark_PandasObjectIsNull
diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py
index 0549b16..e37307c 100644
--- a/python/pyarrow/compat.py
+++ b/python/pyarrow/compat.py
@@ -17,6 +17,8 @@
# flake8: noqa
+from __future__ import absolute_import
+
import itertools
import numpy as np
diff --git a/python/pyarrow/csv.py b/python/pyarrow/csv.py
index 8375ad4..62d9290 100644
--- a/python/pyarrow/csv.py
+++ b/python/pyarrow/csv.py
@@ -15,4 +15,6 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import absolute_import
+
from pyarrow._csv import ReadOptions, ParseOptions, ConvertOptions, read_csv # noqa
diff --git a/python/pyarrow/cuda.py b/python/pyarrow/cuda.py
index 29a217c..e4faa18 100644
--- a/python/pyarrow/cuda.py
+++ b/python/pyarrow/cuda.py
@@ -17,6 +17,8 @@
# flake8: noqa
+from __future__ import absolute_import
+
from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer,
HostBuffer, BufferReader, BufferWriter,
new_host_buffer,
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index 93bcada..91b77cb 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import absolute_import
+
import os
import six
diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py
index 98fb773..f941aa1 100644
--- a/python/pyarrow/filesystem.py
+++ b/python/pyarrow/filesystem.py
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import absolute_import
+
import os
import inspect
import posixpath
diff --git a/python/pyarrow/flight.py b/python/pyarrow/flight.py
index 7d32778..37a21e4 100644
--- a/python/pyarrow/flight.py
+++ b/python/pyarrow/flight.py
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import absolute_import
+
import sys
if sys.version_info < (3,):
diff --git a/python/pyarrow/gandiva.pyx b/python/pyarrow/gandiva.pyx
index 3904a8a..8f23aa1 100644
--- a/python/pyarrow/gandiva.pyx
+++ b/python/pyarrow/gandiva.pyx
@@ -20,6 +20,8 @@
# cython: embedsignature = True
# cython: language_level = 3
+from __future__ import absolute_import
+
from libcpp cimport bool as c_bool, nullptr
from libcpp.memory cimport shared_ptr, unique_ptr, make_shared
from libcpp.string cimport string as c_string
diff --git a/python/pyarrow/hdfs.py b/python/pyarrow/hdfs.py
index 3ddd3cd..9d33ac7 100644
--- a/python/pyarrow/hdfs.py
+++ b/python/pyarrow/hdfs.py
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import absolute_import
+
import os
import posixpath
import sys
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 6656e73..8443c0c 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1045,6 +1045,37 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
CStatus Read(shared_ptr[CTable]* out)
+cdef extern from "arrow/json/options.h" nogil:
+
+ cdef cppclass CJSONReadOptions" arrow::json::ReadOptions":
+ c_bool use_threads
+ int32_t block_size
+
+ @staticmethod
+ CJSONReadOptions Defaults()
+
+ cdef cppclass CJSONParseOptions" arrow::json::ParseOptions":
+ shared_ptr[CSchema] explicit_schema
+ c_bool newlines_in_values
+
+ @staticmethod
+ CJSONParseOptions Defaults()
+
+
+cdef extern from "arrow/json/reader.h" namespace "arrow::json" nogil:
+
+ cdef cppclass CJSONReader" arrow::json::TableReader":
+ @staticmethod
+ CStatus Make(CMemoryPool*, shared_ptr[InputStream],
+ CJSONReadOptions, CJSONParseOptions,
+ shared_ptr[CJSONReader]* out)
+
+ CStatus Read(shared_ptr[CTable]* out)
+
+ cdef CStatus ParseOne(CJSONParseOptions options, shared_ptr[CBuffer] json,
+ shared_ptr[CRecordBatch]* out)
+
+
cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
cdef cppclass CFunctionContext" arrow::compute::FunctionContext":
diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
index 78bb347..c162400 100644
--- a/python/pyarrow/ipc.py
+++ b/python/pyarrow/ipc.py
@@ -17,6 +17,8 @@
# Arrow file and stream reader/writer classes, and other messaging tools
+from __future__ import absolute_import
+
import pyarrow as pa
from pyarrow.lib import (Message, MessageReader, # noqa
diff --git a/python/pyarrow/benchmark.py b/python/pyarrow/json.py
similarity index 87%
copy from python/pyarrow/benchmark.py
copy to python/pyarrow/json.py
index ef1ef53..cfa2528 100644
--- a/python/pyarrow/benchmark.py
+++ b/python/pyarrow/json.py
@@ -15,6 +15,6 @@
# specific language governing permissions and limitations
# under the License.
-# flake8: noqa
+from __future__ import absolute_import
-from pyarrow.lib import benchmark_PandasObjectIsNull
+from pyarrow._json import ReadOptions, ParseOptions, read_json # noqa
diff --git a/python/pyarrow/jvm.py b/python/pyarrow/jvm.py
index 2341f40..9a59e10 100644
--- a/python/pyarrow/jvm.py
+++ b/python/pyarrow/jvm.py
@@ -25,6 +25,8 @@ through jpype. Modules that talk to a remote JVM like py4j will not work as the
memory addresses reported by them are not reachable in the python process.
"""
+from __future__ import absolute_import
+
import pyarrow as pa
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index cb5c732..998848d 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -17,6 +17,8 @@
# cython: language_level = 3
+from __future__ import absolute_import
+
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow cimport CStatus
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 894ced5..783e2b2 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -19,6 +19,8 @@
# distutils: language = c++
# cython: embedsignature = True
+from __future__ import absolute_import
+
import datetime
import decimal as _pydecimal
import json
diff --git a/python/pyarrow/orc.py b/python/pyarrow/orc.py
index 39111e5..6c39407 100644
--- a/python/pyarrow/orc.py
+++ b/python/pyarrow/orc.py
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import absolute_import
+
from itertools import count
from numbers import Integral
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index d90c8a2..8db97c0 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import absolute_import
+
import ast
import json
import operator
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 78f7c0f..d44deee 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import absolute_import
+
from collections import defaultdict
from concurrent import futures
from functools import partial
diff --git a/python/pyarrow/plasma.py b/python/pyarrow/plasma.py
index 13b3eec..748de97 100644
--- a/python/pyarrow/plasma.py
+++ b/python/pyarrow/plasma.py
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import absolute_import
+
import contextlib
import os
import pyarrow as pa
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
index fe170b2..3a605a9 100644
--- a/python/pyarrow/serialization.py
+++ b/python/pyarrow/serialization.py
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import absolute_import
+
import collections
import six
import sys
diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py
new file mode 100644
index 0000000..7885455
--- /dev/null
+++ b/python/pyarrow/tests/test_json.py
@@ -0,0 +1,147 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import io
+import unittest
+
+import pytest
+
+import pyarrow as pa
+from pyarrow.json import read_json, ReadOptions, ParseOptions
+
+
+def test_read_options():
+ cls = ReadOptions
+ opts = cls()
+
+ assert opts.block_size > 0
+ opts.block_size = 12345
+ assert opts.block_size == 12345
+
+ assert opts.use_threads is True
+ opts.use_threads = False
+ assert opts.use_threads is False
+
+ opts = cls(block_size=1234, use_threads=False)
+ assert opts.block_size == 1234
+ assert opts.use_threads is False
+
+
+def test_parse_options():
+ cls = ParseOptions
+ opts = cls()
+ assert opts.newlines_in_values is False
+ assert opts.explicit_schema is None
+
+ opts.newlines_in_values = True
+ assert opts.newlines_in_values is True
+
+ schema = pa.schema([pa.field('foo', pa.int32())])
+ opts.explicit_schema = schema
+ assert opts.explicit_schema == schema
+
+
+class BaseTestJSONRead:
+
+ def read_bytes(self, b, **kwargs):
+ return self.read_json(pa.py_buffer(b), **kwargs)
+
+ def check_names(self, table, names):
+ assert table.num_columns == len(names)
+ assert [c.name for c in table.columns] == names
+
+ def test_file_object(self):
+ data = b'{"a": 1, "b": 2}\n'
+ expected_data = {'a': [1], 'b': [2]}
+ bio = io.BytesIO(data)
+ table = self.read_json(bio)
+ assert table.to_pydict() == expected_data
+ # Text files not allowed
+ sio = io.StringIO(data.decode())
+ with pytest.raises(TypeError):
+ self.read_json(sio)
+
+ def test_simple_ints(self):
+ # Infer integer columns
+ rows = b'{"a": 1,"b": 2, "c": 3}\n{"a": 4,"b": 5, "c": 6}\n'
+ table = self.read_bytes(rows)
+ schema = pa.schema([('a', pa.int64()),
+ ('b', pa.int64()),
+ ('c', pa.int64())])
+ assert table.schema == schema
+ assert table.to_pydict() == {
+ 'a': [1, 4],
+ 'b': [2, 5],
+ 'c': [3, 6],
+ }
+
+ def test_simple_varied(self):
+ # Infer various kinds of data
+ rows = (b'{"a": 1,"b": 2, "c": "3", "d": false}\n'
+ b'{"a": 4.0, "b": -5, "c": "foo", "d": true}\n')
+ table = self.read_bytes(rows)
+ schema = pa.schema([('a', pa.float64()),
+ ('b', pa.int64()),
+ ('c', pa.string()),
+ ('d', pa.bool_())])
+ assert table.schema == schema
+ assert table.to_pydict() == {
+ 'a': [1.0, 4.0],
+ 'b': [2, -5],
+ 'c': [u"3", u"foo"],
+ 'd': [False, True],
+ }
+
+ def test_simple_nulls(self):
+ # Infer various kinds of data, with nulls
+ rows = (b'{"a": 1, "b": 2, "c": null, "d": null, "e": null}\n'
+ b'{"a": null, "b": -5, "c": "foo", "d": null, "e": true}\n'
+ b'{"a": 4.5, "b": null, "c": "nan", "d": null,"e": false}\n')
+ table = self.read_bytes(rows)
+ schema = pa.schema([('a', pa.float64()),
+ ('b', pa.int64()),
+ ('c', pa.string()),
+ ('d', pa.null()),
+ ('e', pa.bool_())])
+ assert table.schema == schema
+ assert table.to_pydict() == {
+ 'a': [1.0, None, 4.5],
+ 'b': [2, -5, None],
+ 'c': [None, u"foo", u"nan"],
+ 'd': [None, None, None],
+ 'e': [None, True, False],
+ }
+
+
+class TestSerialJSONRead(BaseTestJSONRead, unittest.TestCase):
+
+ def read_json(self, *args, **kwargs):
+ read_options = kwargs.setdefault('read_options', ReadOptions())
+ read_options.use_threads = False
+ table = read_json(*args, **kwargs)
+ table._validate()
+ return table
+
+
+class TestParallelJSONRead(BaseTestJSONRead, unittest.TestCase):
+
+ def read_json(self, *args, **kwargs):
+ read_options = kwargs.setdefault('read_options', ReadOptions())
+ read_options.use_threads = True
+ table = read_json(*args, **kwargs)
+ table._validate()
+ return table
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py
index 2bd7027..def1dde 100644
--- a/python/pyarrow/types.py
+++ b/python/pyarrow/types.py
@@ -17,6 +17,8 @@
# Tools for dealing with Arrow type metadata in Python
+from __future__ import absolute_import
+
from pyarrow.lib import (is_boolean_value, # noqa
is_integer_value,
is_float_value)
diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py
index 6c17f5c..5e4fb35 100644
--- a/python/pyarrow/util.py
+++ b/python/pyarrow/util.py
@@ -17,6 +17,8 @@
# Miscellaneous utility code
+from __future__ import absolute_import
+
import functools
import six
import warnings
diff --git a/python/setup.py b/python/setup.py
index 8a6b4e1..88fcba2 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -169,6 +169,7 @@ class build_ext(_build_ext):
CYTHON_MODULE_NAMES = [
'lib',
'_csv',
+ '_json',
'_cuda',
'_flight',
'_parquet',