Posted to commits@arrow.apache.org by ap...@apache.org on 2020/05/19 07:34:12 UTC
[arrow] branch master updated: ARROW-8846: [Dev][Python] Autoformat Python files with archery
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1164079 ARROW-8846: [Dev][Python] Autoformat Python files with archery
1164079 is described below
commit 1164079d5442c3910c18549bfcd2e68d4554b909
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue May 19 09:33:39 2020 +0200
ARROW-8846: [Dev][Python] Autoformat Python files with archery
`archery lint --flake8` becomes `archery lint --python` and now recognizes the `--fix` option.
Reformatting involves running `autopep8`.
Closes #7215 from pitrou/ARROW-8846-archery-autopep8
Authored-by: Antoine Pitrou <an...@python.org>
Signed-off-by: Antoine Pitrou <an...@python.org>
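In practice, the reworked subcommand is used as described above (flag names as introduced by this commit; the `--fix` behavior follows from the `python_linter` changes further down):

    archery lint --python          # report style issues (autopep8 diff + flake8)
    archery lint --python --fix    # rewrite files in place, then run flake8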
---
dev/archery/archery/cli.py | 3 +-
dev/archery/archery/lang/python.py | 11 ++-
dev/archery/archery/utils/command.py | 2 +-
dev/archery/archery/utils/lint.py | 66 ++++++++++++++---
dev/archery/archery/utils/rat.py | 2 +-
dev/archery/requirements-lint.txt | 3 +-
docs/source/developers/archery.rst | 84 ++++++++++++++++++++++
docs/source/developers/benchmarks.rst | 26 ++-----
docs/source/developers/cpp/development.rst | 6 +-
docs/source/developers/docker.rst | 30 ++------
docs/source/developers/documentation.rst | 8 ++-
docs/source/developers/integration.rst | 74 -------------------
docs/source/developers/python.rst | 20 ++----
docs/source/format/Integration.rst | 31 +-------
docs/source/index.rst | 2 +-
python/pyarrow/__init__.py | 6 +-
python/pyarrow/_cuda.pyx | 2 +
python/pyarrow/compat.py | 8 ++-
python/pyarrow/feather.py | 5 +-
python/pyarrow/includes/libarrow.pxd | 2 +
python/pyarrow/includes/libarrow_dataset.pxd | 4 ++
python/pyarrow/includes/libarrow_flight.pxd | 7 ++
python/pyarrow/includes/libarrow_fs.pxd | 2 +
python/pyarrow/io.pxi | 1 +
python/pyarrow/ipc.pxi | 1 +
python/pyarrow/ipc.py | 1 +
python/pyarrow/lib.pxd | 1 +
python/pyarrow/orc.py | 1 +
python/pyarrow/pandas_compat.py | 8 +--
python/pyarrow/parquet.py | 9 ++-
python/pyarrow/plasma.py | 2 +-
python/pyarrow/scalar.pxi | 1 +
python/pyarrow/serialization.py | 12 ++--
python/pyarrow/tensor.pxi | 8 ++-
python/pyarrow/tests/conftest.py | 2 +-
python/pyarrow/tests/strategies.py | 2 +-
python/pyarrow/tests/test_array.py | 4 +-
python/pyarrow/tests/test_csv.py | 66 ++++++++---------
python/pyarrow/tests/test_cython.py | 2 +-
python/pyarrow/tests/test_json.py | 8 +--
python/pyarrow/tests/test_orc.py | 12 ++--
python/pyarrow/tests/test_pandas.py | 102 +++++++++++++--------------
python/pyarrow/tests/test_parquet.py | 4 +-
python/pyarrow/tests/test_plasma.py | 14 ++--
python/pyarrow/tests/test_serialization.py | 6 +-
python/pyarrow/types.pxi | 3 +
python/setup.cfg | 4 ++
python/setup.py | 8 +--
48 files changed, 368 insertions(+), 318 deletions(-)
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 7227af7..15c05a4 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -258,7 +258,8 @@ lint_checks = [
LintCheck('clang-tidy', "Lint C++ files with clang-tidy."),
LintCheck('cpplint', "Lint C++ files with cpplint."),
LintCheck('iwyu', "Lint changed C++ files with Include-What-You-Use."),
- LintCheck('flake8', "Lint Python files with flake8."),
+ LintCheck('python',
+ "Format and lint Python files with autopep8 and flake8."),
LintCheck('numpydoc', "Lint Python files with numpydoc."),
LintCheck('cmake-format', "Format CMake files with cmake-format.py."),
LintCheck('rat',
diff --git a/dev/archery/archery/lang/python.py b/dev/archery/archery/lang/python.py
index 223354e..9b79971 100644
--- a/dev/archery/archery/lang/python.py
+++ b/dev/archery/archery/lang/python.py
@@ -26,7 +26,7 @@ except ImportError:
else:
have_numpydoc = True
-from ..utils.command import Command, default_bin
+from ..utils.command import Command, capture_stdout, default_bin
class Flake8(Command):
@@ -34,6 +34,15 @@ class Flake8(Command):
self.bin = default_bin(flake8_bin, "flake8")
+class Autopep8(Command):
+ def __init__(self, autopep8_bin=None):
+ self.bin = default_bin(autopep8_bin, "autopep8")
+
+ @capture_stdout()
+ def run_captured(self, *args, **kwargs):
+ return self.run(*args, **kwargs)
+
+
def _tokenize_signature(s):
lines = s.encode('ascii').splitlines()
generator = iter(lines).__next__
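The new `Autopep8` class reuses archery's `Command` pattern: `run_captured` wraps `run` with the `capture_stdout` decorator (imported from `archery.utils.command`) so the process output is returned to the caller rather than echoed to the terminal. A minimal sketch of how such a decorator can work, assuming a `subprocess`-based `run` that accepts a `stdout` keyword; this is an illustration of the pattern, not archery's actual implementation:

    import functools
    import subprocess

    def capture_stdout(strip=False):
        # Decorator factory: force the wrapped command to pipe its
        # stdout and hand the captured bytes back to the caller.
        def decorate(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                kwargs['stdout'] = subprocess.PIPE
                result = func(*args, **kwargs)  # e.g. a subprocess.run() call
                out = result.stdout
                return out.strip() if strip else out
            return wrapper
        return decorate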
diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py
index 3ef6abe..2e27f08 100644
--- a/dev/archery/archery/utils/command.py
+++ b/dev/archery/archery/utils/command.py
@@ -25,7 +25,7 @@ from .logger import logger, ctx
def default_bin(name, default):
assert(default)
- env_name = "ARCHERY_%s_BIN".format(default.upper())
+ env_name = "ARCHERY_{0}_BIN".format(default.upper())
return name if name else os.environ.get(env_name, default)
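This one-line change fixes a real bug rather than style: `%s` is a printf-style placeholder that `str.format` ignores, so the computed environment variable name was always the literal `ARCHERY_%s_BIN`. A quick demonstration:

    # Old: "%s" is not a str.format placeholder, so nothing is substituted.
    assert "ARCHERY_%s_BIN".format("FLAKE8") == "ARCHERY_%s_BIN"
    # New: "{0}" substitutes as intended.
    assert "ARCHERY_{0}_BIN".format("FLAKE8") == "ARCHERY_FLAKE8_BIN"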
diff --git a/dev/archery/archery/utils/lint.py b/dev/archery/archery/utils/lint.py
index d24f55b..0101147 100644
--- a/dev/archery/archery/utils/lint.py
+++ b/dev/archery/archery/utils/lint.py
@@ -17,6 +17,7 @@
import gzip
import os
+from pathlib import Path
import click
@@ -26,7 +27,7 @@ from .git import git
from .logger import logger
from ..lang.cpp import CppCMakeDefinition, CppConfiguration
from ..lang.rust import Cargo
-from ..lang.python import Flake8, NumpyDoc
+from ..lang.python import Autopep8, Flake8, NumpyDoc
from .rat import Rat, exclusion_from_globs
from .tmpdir import tmpdir
@@ -106,16 +107,61 @@ def cmake_linter(src, fix=False):
yield LintResult.from_cmd(cmake_format("--check"))
-def python_linter(src):
- """Run flake8 linter on python/pyarrow, and dev/. """
- logger.info("Running Python linters")
- flake8 = Flake8()
+def python_linter(src, fix=False):
+ """Run Python linters on python/pyarrow, python/examples, setup.py
+ and dev/. """
+ setup_py = os.path.join(src.python, "setup.py")
+ setup_cfg = os.path.join(src.python, "setup.cfg")
+
+ logger.info("Running Python formatter (autopep8)")
+
+ autopep8 = Autopep8()
+ if not autopep8.available:
+ logger.error(
+ "Python formatter requested but autopep8 binary not found. "
+ "Please run `pip install -r dev/archery/requirements-lint.txt`")
+ return
+ # Gather files for autopep8
+ patterns = ["python/pyarrow/**/*.py",
+ "python/pyarrow/**/*.pyx",
+ "python/pyarrow/**/*.pxd",
+ "python/pyarrow/**/*.pxi",
+ "python/examples/**/*.py",
+ "dev/archery/**/*.py",
+ ]
+ files = [setup_py]
+ for pattern in patterns:
+ files += list(map(str, Path(src.path).glob(pattern)))
+
+ args = ['--global-config', setup_cfg, '--ignore-local-config']
+ if fix:
+ args += ['-j0', '--in-place']
+ args += sorted(files)
+ yield LintResult.from_cmd(autopep8(*args))
+ else:
+ # XXX `-j0` doesn't work well with `--exit-code`, so instead
+ # we capture the diff and check whether it's empty
+ # (https://github.com/hhatto/autopep8/issues/543)
+ args += ['-j0', '--diff']
+ args += sorted(files)
+ diff = autopep8.run_captured(*args)
+ if diff:
+ print(diff.decode('utf8'))
+ yield LintResult(success=False)
+ else:
+ yield LintResult(success=True)
+
+ # Run flake8 after autopep8 (the latter may have modified some files)
+ logger.info("Running Python linter (flake8)")
+
+ flake8 = Flake8()
if not flake8.available:
- logger.error("Python linter requested but flake8 binary not found.")
+ logger.error(
+ "Python linter requested but flake8 binary not found. "
+ "Please run `pip install -r dev/archery/requirements-lint.txt`")
return
- setup_py = os.path.join(src.python, "setup.py")
yield LintResult.from_cmd(flake8(setup_py, src.pyarrow,
os.path.join(src.python, "examples"),
src.dev, check=False))
@@ -291,7 +337,7 @@ def docker_linter(src):
def linter(src, fix=False, *, clang_format=False, cpplint=False,
clang_tidy=False, iwyu=False, iwyu_all=False,
- flake8=False, numpydoc=False, cmake_format=False, rat=False,
+ python=False, numpydoc=False, cmake_format=False, rat=False,
r=False, rust=False, docker=False):
"""Run all linters."""
with tmpdir(prefix="arrow-lint-") as root:
@@ -311,8 +357,8 @@ def linter(src, fix=False, *, clang_format=False, cpplint=False,
iwyu_all=iwyu_all,
fix=fix))
- if flake8:
- results.extend(python_linter(src))
+ if python:
+ results.extend(python_linter(src, fix=fix))
if numpydoc:
results.extend(python_numpydoc())
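As the `XXX` comment notes, check mode captures autopep8's `--diff` output and treats any non-empty diff as a lint failure, sidestepping the `-j0`/`--exit-code` incompatibility (autopep8 issue 543). A rough standalone sketch of that check with plain `subprocess` (paths are illustrative, not the full file list archery gathers):

    import subprocess
    import sys

    # Run autopep8 in diff mode; any output means changes are needed.
    proc = subprocess.run(
        ["autopep8", "--global-config", "python/setup.cfg",
         "--ignore-local-config", "--diff", "python/setup.py"],
        stdout=subprocess.PIPE)
    if proc.stdout:
        sys.stdout.write(proc.stdout.decode("utf8"))
        sys.exit(1)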
diff --git a/dev/archery/archery/utils/rat.py b/dev/archery/archery/utils/rat.py
index ce78f9f..e7fe19a 100644
--- a/dev/archery/archery/utils/rat.py
+++ b/dev/archery/archery/utils/rat.py
@@ -36,7 +36,7 @@ class Rat(Jar):
@capture_stdout(strip=False)
def run_report(self, archive_path, **kwargs):
- return self.run("--xml", archive_path, **kwargs)
+ return self.run("--xml", archive_path, **kwargs)
def report(self, archive_path, **kwargs):
return RatReport(self.run_report(archive_path, **kwargs))
diff --git a/dev/archery/requirements-lint.txt b/dev/archery/requirements-lint.txt
index b1c02f6..fc7f339 100644
--- a/dev/archery/requirements-lint.txt
+++ b/dev/archery/requirements-lint.txt
@@ -1,2 +1,3 @@
+autopep8
flake8
-cmake_format==0.5.2
\ No newline at end of file
+cmake_format==0.5.2
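Both linters (plus `cmake_format`) can then be installed in one step, as the new error messages above suggest:

    pip install -r dev/archery/requirements-lint.txt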
diff --git a/docs/source/developers/archery.rst b/docs/source/developers/archery.rst
new file mode 100644
index 0000000..012dffb
--- /dev/null
+++ b/docs/source/developers/archery.rst
@@ -0,0 +1,84 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _archery:
+
+Daily Development using Archery
+===============================
+
+To ease some of the daily development tasks, we developed a Python utility
+called Archery.
+
+Installation
+------------
+
+Archery requires Python 3.5 or later. It is recommended to install archery in
+*editable* mode with the ``-e`` flag to automatically update the installation
+when pulling the Arrow repository.
+
+.. code:: bash
+
+ pip install -e dev/archery
+
+Usage
+-----
+
+You can inspect Archery usage by passing the ``--help`` flag:
+
+.. code:: bash
+
+ $ archery --help
+ Usage: archery [OPTIONS] COMMAND [ARGS]...
+
+ Apache Arrow developer utilities.
+
+ See sub-commands help with `archery <cmd> --help`.
+
+ Options:
+ --debug Increase logging with debugging output.
+ --pdb Invoke pdb on uncaught exception.
+ -q, --quiet Silence executed commands.
+ --help Show this message and exit.
+
+ Commands:
+ benchmark Arrow benchmarking.
+ build Initialize an Arrow C++ build
+ docker Interact with docker-compose based builds.
+ integration Execute protocol and Flight integration tests
+ lint Check Arrow source tree for errors
+ numpydoc Lint python docstring with NumpyDoc
+ trigger-bot
+
+Archery exposes independent subcommands, each of which provides dedicated
+help output, for example:
+
+.. code:: bash
+
+ $ archery docker --help
+ Usage: archery docker [OPTIONS] COMMAND [ARGS]...
+
+ Interact with docker-compose based builds.
+
+ Options:
+ --src <arrow_src> Specify Arrow source directory.
+ --help Show this message and exit.
+
+ Commands:
+ images List the available docker-compose images.
+ push Push the generated docker-compose image.
+ run Execute docker-compose builds.
+
diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst
index d85dc2b..31dcc76 100644
--- a/docs/source/developers/benchmarks.rst
+++ b/docs/source/developers/benchmarks.rst
@@ -17,30 +17,14 @@
.. _benchmarks:
-**********
+==========
Benchmarks
-**********
-
-Archery
-=======
-
-``archery`` is a python library and command line utility made to interact with
-Arrow's sources. The main feature is the benchmarking process.
-
-Installation
-~~~~~~~~~~~~
-
-The simplest way to install archery is with pip from the top-level directory.
-It is recommended to use the ``-e,--editable`` flag so that pip don't copy
-the module files but uses the actual sources.
-
-.. code-block:: shell
+==========
- pip install -e dev/archery
- archery --help
+Setup
+=====
- # optional: enable bash/zsh autocompletion
- eval "$(_ARCHERY_COMPLETE=source archery)"
+First install the :ref:`Archery <archery>` utility to run the benchmark suite.
Running the benchmark suite
===========================
diff --git a/docs/source/developers/cpp/development.rst b/docs/source/developers/cpp/development.rst
index f77abac..e8a3226 100644
--- a/docs/source/developers/cpp/development.rst
+++ b/docs/source/developers/cpp/development.rst
@@ -91,7 +91,8 @@ following checks:
compiler warnings with ``-DBUILD_WARNING_LEVEL=CHECKIN``. Note that
there are classes of warnings (such as ``-Wdocumentation``, see more
on this below) that are not caught by ``gcc``.
-* Passes various C++ (and others) style checks, checked with ``archery lint``
+* Passes various C++ (and others) style checks, checked with the ``lint``
+ subcommand to :ref:`Archery <archery>`.
* CMake files pass style checks, can be fixed by running
``run-cmake-format.py`` from the root of the repository. This requires Python
3 and `cmake_format <https://github.com/cheshirekow/cmake_format>`_ (note:
@@ -114,9 +115,6 @@ target that is executable from the root of the repository:
docker-compose run lint
-See :ref:`integration` for more information about the project's
-``docker-compose`` configuration.
-
Cleaning includes with include-what-you-use (IWYU)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/developers/docker.rst b/docs/source/developers/docker.rst
index 7bb4553..cdf77a7 100644
--- a/docs/source/developers/docker.rst
+++ b/docs/source/developers/docker.rst
@@ -15,37 +15,21 @@
.. specific language governing permissions and limitations
.. under the License.
+.. _docker-builds:
+
Running Docker Builds
=====================
-Most of our Linux based continuous integration tasks are decoupled from public
-CI services using docker and docker-compose. Keeping the CI configuration
+Most of our Linux based Continuous Integration tasks are decoupled from public
+CI services using `Docker <https://docs.docker.com/>`_ and
+`docker-compose <https://docs.docker.com/compose/>`_. Keeping the CI configuration
minimal makes local reproducibility possible.
Usage
-----
-There are multiple ways to execute the docker based builds. The recommended is
-to use the archery tool:
-
-Installation
-~~~~~~~~~~~~
-
-``archery`` requires ``python>=3.5``. It is recommended to install archery in
-``editable`` mode with the ``-e`` flag to automatically update the intallation
-by pulling the arrow repository.
-
-.. code:: bash
-
- pip install -e dev/archery[docker]
-
-For the available commands and options invoke the installed archery commands
-with the ``--help`` flag:
-
-.. code:: bash
-
- archery docker --help
- archery docker run --help
+There are multiple ways to execute the docker based builds.
+The recommended way is to use the :ref:`Archery <archery>` tool:
Examples
~~~~~~~~
diff --git a/docs/source/developers/documentation.rst b/docs/source/developers/documentation.rst
index 5878aa5..f024a1f 100644
--- a/docs/source/developers/documentation.rst
+++ b/docs/source/developers/documentation.rst
@@ -89,11 +89,15 @@ you made.
Building with Docker
--------------------
-You can use Archery to build the documentation within a docker container.
-For installation and usage see `Running Docker Builds`_ section.
+You can use :ref:`Archery <archery>` to build the documentation within a
+Docker container.
.. code-block:: shell
archery docker run ubuntu-docs
The final output is located under ``docs/_build/html``.
+
+.. seealso::
+
+   :ref:`docker-builds`.
diff --git a/docs/source/developers/integration.rst b/docs/source/developers/integration.rst
deleted file mode 100644
index e6ce3be..0000000
--- a/docs/source/developers/integration.rst
+++ /dev/null
@@ -1,74 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-.. _integration:
-
-Integration Testing
-===================
-
-Prerequisites
--------------
-
-Arrow uses `Docker <https://docs.docker.com/>`_ and
-`docker-compose <https://docs.docker.com/compose/>`_ for integration testing.
-You can follow the installation `instructions <https://docs.docker.com/compose/install/>`_.
-
-Docker images (services)
-------------------------
-
-The docker-compose services are defined in the ``docker-compose.yml`` file.
-Each service usually correspond to a language binding or an important service
-to test with Arrow.
-
-Services are configured with 2 local mounts, ``/arrow`` for the top-level
-source directory and ``/build`` for caching build artifacts. The source level
-directory mount can be paired with git checkout to test a specific commit. The
-build mount is used for caching and sharing state between staged images.
-
-- *c_glib*: Builds the GLib bindings
-- *cpp*: Builds the C++ project
-- *go*: Builds the go project
-- *java*: Builds the Java project
-- *js*: Builds the Javascript project
-- *python*: Builds the python bindings
-- *r*: Builds the R bindings
-- *rust*: Builds the rust project
-- *lint*: Run various lint on the C++ sources
-- *iwyu*: Run include-what-you-use on the C++ sources
-- *clang-format*: Run clang-format on the C++ sources, modifying in place
-- *clang-tidy*: Run clang-tidy on the C++ sources, outputting recommendations
-- *docs*: Builds this documentation
-
-You can build and run a service by using the `build` and `run` docker-compose
-sub-command, e.g. `docker-compose build python && docker-compose run python`.
-We do not publish the build images, you need to build them manually. This
-method requires the user to build the images in reverse dependency order.
-
-.. code-block:: shell
-
- # Build and run manually
- docker-compose build conda-cpp
- docker-compose build conda-python
- docker-compose run conda-python
-
-To simplify this, Archery provides a command for it:
-
-.. code-block:: shell
-
- archery docker run conda-python
-
-See `Running Docker Builds`_ for more details.
diff --git a/docs/source/developers/python.rst b/docs/source/developers/python.rst
index be5e9c6..75a51b4 100644
--- a/docs/source/developers/python.rst
+++ b/docs/source/developers/python.rst
@@ -18,9 +18,9 @@
.. currentmodule:: pyarrow
.. _python-development:
-******************
+==================
Python Development
-******************
+==================
This page provides general Python development guidelines and source build
instructions for all platforms.
@@ -29,22 +29,18 @@ Coding Style
============
We follow a similar PEP8-like coding style to the `pandas project
-<https://github.com/pandas-dev/pandas>`_.
-
-The code must pass ``flake8`` (available from pip or conda) or it will fail the
-build. Check for style errors before submitting your pull request with:
+<https://github.com/pandas-dev/pandas>`_. To check style issues, use the
+:ref:`Archery <archery>` subcommand ``lint``:
.. code-block:: shell
- flake8 .
- flake8 --config=.flake8.cython .
+ archery lint --python
-The package ``autopep8`` (also available from pip or conda) can automatically
-fix many of the errors reported by ``flake8``:
+Some of the issues can be automatically fixed by passing the ``--fix`` option:
.. code-block:: shell
- autopep8 --in-place --global-config=.flake8.cython pyarrow/table.pxi
+ archery lint --python --fix
Unit Testing
============
@@ -55,9 +51,7 @@ like so:
.. code-block:: shell
- pushd arrow/python
pytest pyarrow
- popd
Package requirements to run the unit tests are found in
``requirements-test.txt`` and can be installed if needed with ``pip install -r
diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst
index 571d0fa..a3e4205 100644
--- a/docs/source/format/Integration.rst
+++ b/docs/source/format/Integration.rst
@@ -32,33 +32,8 @@ Our strategy for integration testing between Arrow implementations is:
Running integration tests
-------------------------
-The integration test data generator and runner uses ``archery``, a Python script
-that requires Python 3.6 or higher. You can create a standalone Python
-distribution and environment for running the tests by using
-`miniconda <https://conda.io/miniconda.html>`_. On Linux this is:
-
-.. code-block:: shell
-
- MINICONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
- wget -O miniconda.sh $MINICONDA_URL
- bash miniconda.sh -b -p miniconda
- export PATH=`pwd`/miniconda/bin:$PATH
-
- conda create -n arrow-integration python=3.6 nomkl numpy six
- conda activate arrow-integration
-
-
-If you are on macOS, instead use the URL:
-
-.. code-block:: shell
-
- MINICONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
-
-Once you have Python, you can install archery
-
-.. code-block:: shell
-
- pip install -e dev/archery
+The integration test data generator and runner are implemented inside
+the :ref:`Archery <archery>` utility.
The integration tests are run using the ``archery integration`` command.
@@ -101,7 +76,7 @@ docker-compose. You may also run the docker-compose job locally, or at least
refer to it if you have questions about how to build other languages or enable
certain tests.
-See :ref:`integration` for more information about the project's
+See :ref:`docker-builds` for more information about the project's
``docker-compose`` configuration.
JSON test data format
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 021e2d5..f83d763 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -67,7 +67,7 @@ such topics as:
developers/contributing
developers/cpp/index
developers/python
- developers/integration
+ developers/archery
developers/crossbow
developers/docker
developers/benchmarks
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 9ae0d9c..e8b8c6e 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -40,6 +40,7 @@ except ImportError:
try:
import setuptools_scm
# Code duplicated from setup.py to avoid a dependency on each other
+
def parse_git(root, **kwargs):
"""
Parse function for setuptools_scm that ignores tags for non-C++
@@ -186,6 +187,7 @@ import pyarrow.types as types
# Entry point for starting the plasma store
+
def _plasma_store_entry_point():
"""Entry point for starting the plasma store.
@@ -202,6 +204,7 @@ def _plasma_store_entry_point():
# ----------------------------------------------------------------------
# Deprecations
+
from pyarrow.util import _deprecate_api # noqa
read_message = _deprecate_api("read_message", "ipc.read_message",
@@ -218,7 +221,7 @@ read_tensor = _deprecate_api("read_tensor", "ipc.read_tensor",
ipc.read_tensor, "0.17.0")
write_tensor = _deprecate_api("write_tensor", "ipc.write_tensor",
- ipc.write_tensor, "0.17.0")
+ ipc.write_tensor, "0.17.0")
get_record_batch_size = _deprecate_api("get_record_batch_size",
"ipc.get_record_batch_size",
@@ -243,6 +246,7 @@ from pyarrow.ipc import (Message, MessageReader,
# Returning absolute path to the pyarrow include directory (if bundled, e.g. in
# wheels)
+
def get_include():
"""
Return absolute path to directory containing Arrow C++ include
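Many of the pyarrow hunks from here on are the same mechanical autopep8 fix: a blank line inserted between a class docstring (or a trailing comment) and the next `def`, per pycodestyle's blank-line rules (E301/E302 is my identification; the commit does not name the codes). Schematically:

    class Example:
        """Docstring."""

        def __init__(self):  # autopep8 inserted the blank line above
            pass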
diff --git a/python/pyarrow/_cuda.pyx b/python/pyarrow/_cuda.pyx
index af9f422..c59b8dd 100644
--- a/python/pyarrow/_cuda.pyx
+++ b/python/pyarrow/_cuda.pyx
@@ -726,6 +726,7 @@ cdef class BufferReader(NativeFile):
may expect to be able to do anything other than pointer arithmetic
on the returned buffers.
"""
+
def __cinit__(self, CudaBuffer obj):
self.buffer = obj
self.reader = new CCudaBufferReader(self.buffer.buffer)
@@ -774,6 +775,7 @@ cdef class BufferWriter(NativeFile):
By default writes are unbuffered. Use set_buffer_size to enable
buffering.
"""
+
def __cinit__(self, CudaBuffer buffer):
self.buffer = buffer
self.writer = new CCudaBufferWriter(self.buffer.cuda_buffer)
diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py
index 22931a2..c2e7c32 100644
--- a/python/pyarrow/compat.py
+++ b/python/pyarrow/compat.py
@@ -30,16 +30,19 @@ except ImportError:
from collections.abc import Iterable, Mapping, Sequence
+
def guid():
from uuid import uuid4
return uuid4().hex
+
def tobytes(o):
if isinstance(o, str):
return o.encode('utf8')
else:
return o
+
def frombytes(o, *, safe=False):
if safe:
return o.decode('utf8', errors='replace')
@@ -60,6 +63,7 @@ try:
except ImportError:
pickle = builtin_pickle
+
def encode_file_path(path):
if isinstance(path, str):
# POSIX systems can handle utf-8. UTF8 is converted to utf16-le in
@@ -120,9 +124,9 @@ except ImportError:
names, formats, offsets = zip(*fields)
# names may be (title, names) tuples
- nametups = (n if isinstance(n, tuple) else (None, n) for n in names)
+ nametups = (n if isinstance(n, tuple) else (None, n) for n in names)
titles, names = zip(*nametups)
return np.dtype({'names': names, 'formats': formats, 'titles': titles,
- 'offsets': offsets, 'itemsize': offset})
+ 'offsets': offsets, 'itemsize': offset})
__all__ = []
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index 6071b5e..7b813af 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -40,6 +40,7 @@ class FeatherDataset:
validate_schema : bool, default True
Check that individual file schemas are all the same / compatible
"""
+
def __init__(self, path_or_paths, validate_schema=True):
_check_pandas_version()
self.paths = path_or_paths
@@ -142,8 +143,8 @@ def write_feather(df, dest, compression=None, compression_level=None,
"""
if _pandas_api.have_pandas:
_check_pandas_version()
- if (_pandas_api.has_sparse
- and isinstance(df, _pandas_api.pd.SparseDataFrame)):
+ if (_pandas_api.has_sparse and
+ isinstance(df, _pandas_api.pd.SparseDataFrame)):
df = df.to_dense()
if _pandas_api.is_data_frame(df):
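This `feather.py` hunk, and several later ones (`pandas_compat.py`, `parquet.py`, `serialization.py`, some tests, `setup.py`), all flip the same pattern: the binary operator moves from the start of the continuation line to the end of the preceding line. My reading is that this follows from the new `[pep8]` section at the bottom of the diff, which ignores W504 (line break after a binary operator), so autopep8 normalizes toward trailing operators. Schematically:

    first_value, second_value = 1, 2

    # Before: continuation line starts with the operator (pycodestyle W503).
    total = (first_value
             + second_value)

    # After: the operator ends the line; with W504 ignored, this form passes.
    total = (first_value +
             second_value)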
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index f888a30..8e1c512 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1460,8 +1460,10 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
cdef cppclass CCastOptions" arrow::compute::CastOptions":
CCastOptions()
CCastOptions(c_bool safe)
+
@staticmethod
CCastOptions Safe()
+
@staticmethod
CCastOptions Unsafe()
c_bool allow_int_overflow
diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd
index a4731d7..0493892 100644
--- a/python/pyarrow/includes/libarrow_dataset.pxd
+++ b/python/pyarrow/includes/libarrow_dataset.pxd
@@ -96,6 +96,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
cdef cppclass CScanOptions "arrow::dataset::ScanOptions":
CRecordBatchProjector projector
+
@staticmethod
shared_ptr[CScanOptions] Make(shared_ptr[CSchema] schema)
@@ -260,6 +261,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
cdef cppclass CDirectoryPartitioning \
"arrow::dataset::DirectoryPartitioning"(CPartitioning):
CDirectoryPartitioning(shared_ptr[CSchema] schema)
+
@staticmethod
shared_ptr[CPartitioningFactory] MakeFactory(
vector[c_string] field_names)
@@ -267,6 +269,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
cdef cppclass CHivePartitioning \
"arrow::dataset::HivePartitioning"(CPartitioning):
CHivePartitioning(shared_ptr[CSchema] schema)
+
@staticmethod
shared_ptr[CPartitioningFactory] MakeFactory()
@@ -302,6 +305,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
shared_ptr[CFileFormat] format,
CFileSystemFactoryOptions options
)
+
@staticmethod
CResult[shared_ptr[CDatasetFactory]] MakeFromSelector "Make"(
shared_ptr[CFileSystem] filesystem,
diff --git a/python/pyarrow/includes/libarrow_flight.pxd b/python/pyarrow/includes/libarrow_flight.pxd
index 0dba924..cd5f9d0 100644
--- a/python/pyarrow/includes/libarrow_flight.pxd
+++ b/python/pyarrow/includes/libarrow_flight.pxd
@@ -66,6 +66,7 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil:
c_string cmd
vector[c_string] path
CStatus SerializeToString(c_string* out)
+
@staticmethod
CStatus Deserialize(const c_string& serialized,
CFlightDescriptor* out)
@@ -76,6 +77,7 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil:
c_string ticket
bint operator==(CTicket)
CStatus SerializeToString(c_string* out)
+
@staticmethod
CStatus Deserialize(const c_string& serialized, CTicket* out)
@@ -90,10 +92,13 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil:
@staticmethod
CStatus Parse(c_string& uri_string, CLocation* location)
+
@staticmethod
CStatus ForGrpcTcp(c_string& host, int port, CLocation* location)
+
@staticmethod
CStatus ForGrpcTls(c_string& host, int port, CLocation* location)
+
@staticmethod
CStatus ForGrpcUnix(c_string& path, CLocation* location)
@@ -113,6 +118,7 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil:
CFlightDescriptor& descriptor()
const vector[CFlightEndpoint]& endpoints()
CStatus SerializeToString(c_string* out)
+
@staticmethod
CStatus Deserialize(const c_string& serialized,
unique_ptr[CFlightInfo]* out)
@@ -327,6 +333,7 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil:
cdef cppclass FlightStatusDetail" arrow::flight::FlightStatusDetail":
CFlightStatusCode code()
c_string extra_info()
+
@staticmethod
shared_ptr[FlightStatusDetail] UnwrapStatus(const CStatus& status)
diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd
index 3483673..a794753 100644
--- a/python/pyarrow/includes/libarrow_fs.pxd
+++ b/python/pyarrow/includes/libarrow_fs.pxd
@@ -132,6 +132,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil:
@staticmethod
CS3Options Defaults()
+
@staticmethod
CS3Options FromAccessKey(const c_string& access_key,
const c_string& secret_key)
@@ -150,6 +151,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil:
int32_t buffer_size
int16_t replication
int64_t default_block_size
+
@staticmethod
CResult[CHdfsOptions] FromUriString "FromUri"(
const c_string& uri_string)
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index 198bfb7..b1032d6 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -1194,6 +1194,7 @@ cdef class CompressedInputStream(NativeFile):
compression : str
The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd").
"""
+
def __init__(self, NativeFile stream, str compression not None):
cdef:
Codec codec = Codec(compression)
diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi
index 1b395b3..a91eabd 100644
--- a/python/pyarrow/ipc.pxi
+++ b/python/pyarrow/ipc.pxi
@@ -22,6 +22,7 @@ cdef class Message:
"""
Container for an Arrow IPC message with metadata and optional body
"""
+
def __cinit__(self):
pass
diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
index f76969b..fbbf98a 100644
--- a/python/pyarrow/ipc.py
+++ b/python/pyarrow/ipc.py
@@ -96,6 +96,7 @@ class RecordBatchFileReader(lib._RecordBatchFileReader, _ReadPandasOption):
If the file is embedded in some larger file, this is the byte offset to
the very end of the file data
"""
+
def __init__(self, source, footer_offset=None):
self._open(source, footer_offset=footer_offset)
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index da6d28c..e629c0e 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -154,6 +154,7 @@ cdef class KeyValueMetadata(_Metadata):
const CKeyValueMetadata* metadata
cdef void init(self, const shared_ptr[const CKeyValueMetadata]& wrapped)
+
@staticmethod
cdef wrap(const shared_ptr[const CKeyValueMetadata]& sp)
cdef inline shared_ptr[const CKeyValueMetadata] unwrap(self) nogil
diff --git a/python/pyarrow/orc.py b/python/pyarrow/orc.py
index f335ce7..d1f0925 100644
--- a/python/pyarrow/orc.py
+++ b/python/pyarrow/orc.py
@@ -68,6 +68,7 @@ class ORCFile:
Readable source. For passing Python file objects or byte buffers,
see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
"""
+
def __init__(self, source):
self.reader = _orc.ORCReader()
self.reader.open(source)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 55c38c9..a2a461f 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -363,8 +363,8 @@ def _get_columns_to_convert(df, schema, preserve_index, columns):
index_column_names = []
for i, index_level in enumerate(index_levels):
name = _index_level_name(index_level, i, column_names)
- if (isinstance(index_level, _pandas_api.pd.RangeIndex)
- and preserve_index is None):
+ if (isinstance(index_level, _pandas_api.pd.RangeIndex) and
+ preserve_index is None):
descr = _get_range_index_descriptor(index_level)
else:
columns_to_convert.append(index_level)
@@ -773,8 +773,8 @@ def table_to_blockmanager(options, table, categories=None,
# dataframe (complex not included since not supported by Arrow)
_pandas_supported_numpy_types = {
str(np.dtype(typ))
- for typ in (np.sctypes['int'] + np.sctypes['uint'] + np.sctypes['float']
- + ['object', 'bool'])
+ for typ in (np.sctypes['int'] + np.sctypes['uint'] + np.sctypes['float'] +
+ ['object', 'bool'])
}
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 9da33b1..51542ee 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -93,9 +93,9 @@ def _check_filters(filters, check_null_strings=True):
for conjunction in filters:
for col, op, val in conjunction:
if (
- isinstance(val, list)
- and all(_check_contains_null(v) for v in val)
- or _check_contains_null(val)
+ isinstance(val, list) and
+ all(_check_contains_null(v) for v in val) or
+ _check_contains_null(val)
):
raise NotImplementedError(
"Null-terminated binary strings are not supported "
@@ -192,6 +192,7 @@ class ParquetFile:
If positive, perform read buffering when deserializing individual
column chunks. Otherwise IO calls are unbuffered.
"""
+
def __init__(self, source, metadata=None, common_metadata=None,
read_dictionary=None, memory_map=False, buffer_size=0):
self.reader = ParquetReader()
@@ -619,6 +620,7 @@ class ParquetDatasetPiece:
row_group : int, default None
Row group to load. By default, reads all row groups.
"""
+
def __init__(self, path, open_file_func=partial(open, mode='rb'),
file_options=None, row_group=None, partition_keys=None):
self.path = _stringify_path(path)
@@ -1366,6 +1368,7 @@ class _ParquetDatasetV2:
"""
ParquetDataset shim using the Dataset API under the hood.
"""
+
def __init__(self, path_or_paths, filesystem=None, filters=None,
partitioning="hive", read_dictionary=None, buffer_size=None,
memory_map=False, **kwargs):
diff --git a/python/pyarrow/plasma.py b/python/pyarrow/plasma.py
index 251b9db..a4bf79b 100644
--- a/python/pyarrow/plasma.py
+++ b/python/pyarrow/plasma.py
@@ -25,7 +25,7 @@ import sys
import tempfile
import time
-from pyarrow._plasma import (ObjectID, ObjectNotAvailable, # noqa
+from pyarrow._plasma import (ObjectID, ObjectNotAvailable, # noqa
PlasmaBuffer, PlasmaClient, connect,
PlasmaObjectExists, PlasmaObjectNotFound,
PlasmaStoreFull)
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 3538483..64e7412 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -30,6 +30,7 @@ cdef class NullType(Scalar):
Singleton for null array elements.
"""
# TODO rename this NullValue?
+
def __cinit__(self):
global NA
if NA is not None:
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
index fb82f64..55d7260 100644
--- a/python/pyarrow/serialization.py
+++ b/python/pyarrow/serialization.py
@@ -150,8 +150,8 @@ def _register_custom_pandas_handlers(context):
)
def _serialize_pandas_dataframe(obj):
- if (pdcompat._pandas_api.has_sparse
- and isinstance(obj, pd.SparseDataFrame)):
+ if (pdcompat._pandas_api.has_sparse and
+ isinstance(obj, pd.SparseDataFrame)):
raise NotImplementedError(
sparse_type_error_msg.format('SparseDataFrame')
)
@@ -162,8 +162,8 @@ def _register_custom_pandas_handlers(context):
return pdcompat.serialized_dict_to_dataframe(data)
def _serialize_pandas_series(obj):
- if (pdcompat._pandas_api.has_sparse
- and isinstance(obj, pd.SparseSeries)):
+ if (pdcompat._pandas_api.has_sparse and
+ isinstance(obj, pd.SparseSeries)):
raise NotImplementedError(
sparse_type_error_msg.format('SparseSeries')
)
@@ -302,7 +302,7 @@ def _register_collections_serialization_handlers(serialization_context):
def _register_scipy_handlers(serialization_context):
try:
from scipy.sparse import (csr_matrix, csc_matrix, coo_matrix,
- isspmatrix_coo, isspmatrix_csr,
+ isspmatrix_coo, isspmatrix_csr,
isspmatrix_csc, isspmatrix)
def _serialize_scipy_sparse(obj):
@@ -320,7 +320,7 @@ def _register_scipy_handlers(serialization_context):
else:
raise NotImplementedError(
- "Serialization of {} is not supported.".format(obj[0]))
+ "Serialization of {} is not supported.".format(obj[0]))
def _deserialize_scipy_sparse(data):
if data[0] == 'coo':
diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi
index f74e3c8..edea4ef 100644
--- a/python/pyarrow/tensor.pxi
+++ b/python/pyarrow/tensor.pxi
@@ -605,8 +605,9 @@ shape: {0.shape}""".format(self)
indices = np.require(obj.indices, dtype='i8')
check_status(NdarraysToSparseCSCMatrix(c_default_memory_pool(),
- obj.data, indptr, indices, c_shape,
- c_dim_names, &csparse_tensor))
+ obj.data, indptr, indices,
+ c_shape, c_dim_names,
+ &csparse_tensor))
return pyarrow_wrap_sparse_csc_matrix(csparse_tensor)
@staticmethod
@@ -646,7 +647,8 @@ shape: {0.shape}""".format(self)
cdef PyObject* out_indices
check_status(SparseCSCMatrixToNdarray(self.sp_sparse_tensor, self,
- &out_data, &out_indptr, &out_indices))
+ &out_data, &out_indptr,
+ &out_indices))
data = PyObject_to_object(out_data)
indptr = PyObject_to_object(out_indptr)
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index ce72f2e..6b2ca56 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -104,7 +104,7 @@ except ImportError:
pass
try:
- import pyarrow.orc # noqa
+ import pyarrow.orc # noqa
defaults['orc'] = True
except ImportError:
pass
diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py
index 2d58080..088f291 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -110,7 +110,7 @@ def list_types(item_strategy=primitive_types):
return (
st.builds(pa.list_, item_strategy) |
st.builds(pa.large_list, item_strategy)
- )
+ )
def struct_types(item_strategy=primitive_types):
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 3785d0e..dd9e9e6 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -1208,7 +1208,7 @@ def test_cast_from_null():
pa.struct([pa.field('a', pa.int32()),
pa.field('b', pa.list_(pa.int8())),
pa.field('c', pa.string())]),
- ]
+ ]
for out_type in out_types:
_check_cast_case((in_data, in_type, in_data, out_type))
@@ -1218,7 +1218,7 @@ def test_cast_from_null():
pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
pa.union([pa.field('a', pa.binary(10)),
pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
- ]
+ ]
in_arr = pa.array(in_data, type=pa.null())
for out_type in out_types:
with pytest.raises(NotImplementedError):
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index 0992f14..abc9d31 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -234,7 +234,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
"ef": ["ij", "mn"],
"gh": ["kl", "op"],
- }
+ }
opts.skip_rows = 3
table = self.read_bytes(rows, read_options=opts)
@@ -242,7 +242,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
"mn": [],
"op": [],
- }
+ }
opts.skip_rows = 4
with pytest.raises(pa.ArrowInvalid):
@@ -257,7 +257,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
"ij": ["mn"],
"kl": ["op"],
- }
+ }
def test_header_column_names(self):
rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
@@ -269,7 +269,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
"x": ["ab", "ef", "ij", "mn"],
"y": ["cd", "gh", "kl", "op"],
- }
+ }
opts.skip_rows = 3
table = self.read_bytes(rows, read_options=opts)
@@ -277,7 +277,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
"x": ["mn"],
"y": ["op"],
- }
+ }
opts.skip_rows = 4
table = self.read_bytes(rows, read_options=opts)
@@ -285,7 +285,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
"x": [],
"y": [],
- }
+ }
opts.skip_rows = 5
with pytest.raises(pa.ArrowInvalid):
@@ -308,7 +308,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
"x": ["ij", "mn"],
"y": ["kl", "op"],
- }
+ }
def test_header_autogenerate_column_names(self):
rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
@@ -320,7 +320,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
"f0": ["ab", "ef", "ij", "mn"],
"f1": ["cd", "gh", "kl", "op"],
- }
+ }
opts.skip_rows = 3
table = self.read_bytes(rows, read_options=opts)
@@ -328,7 +328,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
"f0": ["mn"],
"f1": ["op"],
- }
+ }
# Not enough rows, impossible to infer number of columns
opts.skip_rows = 4
@@ -344,7 +344,7 @@ class BaseTestCSVRead:
self.check_names(table, ["ab"])
assert table.to_pydict() == {
"ab": ["ef", "ij", "mn"],
- }
+ }
# Order of include_columns is respected, regardless of CSV order
convert_options.include_columns = ['cd', 'ab']
@@ -355,7 +355,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
"cd": ["gh", "kl", "op"],
"ab": ["ef", "ij", "mn"],
- }
+ }
# Include a column not in the CSV file => raises by default
convert_options.include_columns = ['xx', 'ab', 'yy']
@@ -381,7 +381,7 @@ class BaseTestCSVRead:
"xx": [None, None, None],
"ab": ["ef", "ij", "mn"],
"yy": [None, None, None],
- }
+ }
# Combining with `column_names`
read_options.column_names = ["xx", "yy"]
@@ -394,7 +394,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
"yy": ["cd", "gh", "kl", "op"],
"cd": [None, None, None, None],
- }
+ }
# And with `column_types` as well
convert_options.column_types = {"yy": pa.binary(),
@@ -407,7 +407,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
"yy": [b"cd", b"gh", b"kl", b"op"],
"cd": [None, None, None, None],
- }
+ }
def test_simple_ints(self):
# Infer integer columns
@@ -421,7 +421,7 @@ class BaseTestCSVRead:
'a': [1, 4],
'b': [2, 5],
'c': [3, 6],
- }
+ }
def test_simple_varied(self):
# Infer various kinds of data
@@ -437,7 +437,7 @@ class BaseTestCSVRead:
'b': [2, -5],
'c': ["3", "foo"],
'd': [False, True],
- }
+ }
def test_simple_nulls(self):
# Infer various kinds of data, with nulls
@@ -460,7 +460,7 @@ class BaseTestCSVRead:
'd': [None, None, None],
'e': [b"3", b"nan", b"\xff"],
'f': [None, True, False],
- }
+ }
def test_simple_timestamps(self):
# Infer a timestamp column
@@ -472,7 +472,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
'a': [1970, 1989],
'b': [datetime(1970, 1, 1), datetime(1989, 7, 14)],
- }
+ }
def test_auto_dict_encode(self):
opts = ConvertOptions(auto_dict_encode=True)
@@ -483,7 +483,7 @@ class BaseTestCSVRead:
expected = {
'a': ["ab", "cdé", "cdé", "ab"],
'b': [1, 2, 3, 4],
- }
+ }
assert table.schema == schema
assert table.to_pydict() == expected
@@ -518,7 +518,7 @@ class BaseTestCSVRead:
expected = {
'a': [b"ab", b"cd\xff", b"ab"],
'b': [1, 2, 3],
- }
+ }
assert table.schema == schema
assert table.to_pydict() == expected
@@ -537,7 +537,7 @@ class BaseTestCSVRead:
'b': ["Xxx", "#N/A"],
'c': ["1", ""],
'd': [2, None],
- }
+ }
opts = ConvertOptions(null_values=['Xxx', 'Zzz'],
strings_can_be_null=True)
@@ -547,7 +547,7 @@ class BaseTestCSVRead:
'b': [None, "#N/A"],
'c': ["1", ""],
'd': [2, None],
- }
+ }
opts = ConvertOptions(null_values=[])
rows = b"a,b\n#N/A,\n"
@@ -558,7 +558,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
'a': ["#N/A"],
'b': [""],
- }
+ }
def test_custom_bools(self):
# Infer booleans with custom values
@@ -579,7 +579,7 @@ class BaseTestCSVRead:
'a': ["True", "False", "True", "False", "N/A"],
'b': [True, False, True, False, None],
'c': ["t", "f", "yes", "no", "N/A"],
- }
+ }
def test_column_types(self):
# Ask for specific column types in ConvertOptions
@@ -601,7 +601,7 @@ class BaseTestCSVRead:
'c': ["3", "6"],
'd': [True, False],
'e': [Decimal("1.00"), Decimal("0.00")]
- }
+ }
assert table.schema == schema
assert table.to_pydict() == expected
# Pass column_types as schema
@@ -636,7 +636,7 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
'x': [b'a', b'c', b'e'],
'y': ['b', 'd', 'f'],
- }
+ }
def test_no_ending_newline(self):
# No \n after last line
@@ -646,7 +646,7 @@ class BaseTestCSVRead:
'a': [1, 4],
'b': [2, 5],
'c': [3, 6],
- }
+ }
def test_trivial(self):
# A bit pointless, but at least it shouldn't crash
@@ -660,20 +660,20 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
'a': [1, 3],
'b': [2, 4],
- }
+ }
parse_options = ParseOptions(ignore_empty_lines=False)
table = self.read_bytes(rows, parse_options=parse_options)
assert table.to_pydict() == {
'a': [None, 1, None, 3],
'b': [None, 2, None, 4],
- }
+ }
read_options = ReadOptions(skip_rows=2)
table = self.read_bytes(rows, parse_options=parse_options,
read_options=read_options)
assert table.to_pydict() == {
'1': [None, 3],
'2': [None, 4],
- }
+ }
def test_invalid_csv(self):
# Various CSV errors
@@ -693,13 +693,13 @@ class BaseTestCSVRead:
assert table.to_pydict() == {
'a;b': ['de'],
'c': ['fg;eh'],
- }
+ }
opts = ParseOptions(delimiter=';')
table = self.read_bytes(rows, parse_options=opts)
assert table.to_pydict() == {
'a': ['de,fg'],
'b,c': ['eh'],
- }
+ }
def test_small_random_csv(self):
csv, expected = make_random_csv(num_cols=2, num_rows=10)
@@ -1082,7 +1082,7 @@ class TestGZipCSVRead(BaseTestCompressedCSVRead, unittest.TestCase):
assert table.to_pydict() == {
'ab': ['ef', 'ij', 'mn'],
'cd': ['gh', 'kl', 'op'],
- }
+ }
class TestBZ2CSVRead(BaseTestCompressedCSVRead, unittest.TestCase):
diff --git a/python/pyarrow/tests/test_cython.py b/python/pyarrow/tests/test_cython.py
index 202868d..30fd806 100644
--- a/python/pyarrow/tests/test_cython.py
+++ b/python/pyarrow/tests/test_cython.py
@@ -116,7 +116,7 @@ def test_cython_api(tmpdir):
arr = mod.make_null_array(5)
assert mod.get_array_length(arr) == 5
assert arr.null_count == 5
- """.format(mod_path=str(tmpdir), mod_name='pyarrow_cython_example')
+ """.format(mod_name='pyarrow_cython_example')
if sys.platform == 'win32':
delim, var = ';', 'PATH'
diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py
index d373034..cfae932 100644
--- a/python/pyarrow/tests/test_json.py
+++ b/python/pyarrow/tests/test_json.py
@@ -131,7 +131,7 @@ class BaseTestJSONRead:
'a': [1, 4],
'b': [2, 5],
'c': [3, 6],
- }
+ }
def test_simple_ints(self):
# Infer integer columns
@@ -145,7 +145,7 @@ class BaseTestJSONRead:
'a': [1, 4],
'b': [2, 5],
'c': [3, 6],
- }
+ }
def test_simple_varied(self):
# Infer various kinds of data
@@ -162,7 +162,7 @@ class BaseTestJSONRead:
'b': [2, -5],
'c': ["3", "foo"],
'd': [False, True],
- }
+ }
def test_simple_nulls(self):
# Infer various kinds of data, with nulls
@@ -182,7 +182,7 @@ class BaseTestJSONRead:
'c': [None, "foo", "nan"],
'd': [None, None, None],
'e': [None, True, False],
- }
+ }
def test_small_random_json(self):
data, expected = make_random_json(num_cols=2, num_rows=10)
diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py
index 44e98a0..cc75886 100644
--- a/python/pyarrow/tests/test_orc.py
+++ b/python/pyarrow/tests/test_orc.py
@@ -148,18 +148,18 @@ def test_orcfile_empty(datadir):
('list', pa.list_(pa.struct([
('int1', pa.int32()),
('string1', pa.string()),
- ]))),
- ])),
+ ]))),
+ ])),
('list', pa.list_(pa.struct([
('int1', pa.int32()),
('string1', pa.string()),
- ]))),
+ ]))),
('map', pa.list_(pa.struct([
('key', pa.string()),
('value', pa.struct([
('int1', pa.int32()),
('string1', pa.string()),
- ])),
- ]))),
- ])
+ ])),
+ ]))),
+ ])
assert table.schema == expected_schema
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index a4bb37f..3a07ffa 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -1302,35 +1302,35 @@ class TestConvertDateTimeLikeTypes:
def test_numpy_datetime64_columns(self):
datetime64_ns = np.array([
- '2007-07-13T01:23:34.123456789',
- None,
- '2006-01-13T12:34:56.432539784',
- '2010-08-13T05:46:57.437699912'],
- dtype='datetime64[ns]')
+ '2007-07-13T01:23:34.123456789',
+ None,
+ '2006-01-13T12:34:56.432539784',
+ '2010-08-13T05:46:57.437699912'],
+ dtype='datetime64[ns]')
_check_array_from_pandas_roundtrip(datetime64_ns)
datetime64_us = np.array([
- '2007-07-13T01:23:34.123456',
- None,
- '2006-01-13T12:34:56.432539',
- '2010-08-13T05:46:57.437699'],
- dtype='datetime64[us]')
+ '2007-07-13T01:23:34.123456',
+ None,
+ '2006-01-13T12:34:56.432539',
+ '2010-08-13T05:46:57.437699'],
+ dtype='datetime64[us]')
_check_array_from_pandas_roundtrip(datetime64_us)
datetime64_ms = np.array([
- '2007-07-13T01:23:34.123',
- None,
- '2006-01-13T12:34:56.432',
- '2010-08-13T05:46:57.437'],
- dtype='datetime64[ms]')
+ '2007-07-13T01:23:34.123',
+ None,
+ '2006-01-13T12:34:56.432',
+ '2010-08-13T05:46:57.437'],
+ dtype='datetime64[ms]')
_check_array_from_pandas_roundtrip(datetime64_ms)
datetime64_s = np.array([
- '2007-07-13T01:23:34',
- None,
- '2006-01-13T12:34:56',
- '2010-08-13T05:46:57'],
- dtype='datetime64[s]')
+ '2007-07-13T01:23:34',
+ None,
+ '2006-01-13T12:34:56',
+ '2010-08-13T05:46:57'],
+ dtype='datetime64[s]')
_check_array_from_pandas_roundtrip(datetime64_s)
def test_timestamp_to_pandas_ns(self):
@@ -1378,11 +1378,11 @@ class TestConvertDateTimeLikeTypes:
@pytest.mark.parametrize('dtype', [pa.date32(), pa.date64()])
def test_numpy_datetime64_day_unit(self, dtype):
datetime64_d = np.array([
- '2007-07-13',
- None,
- '2006-01-15',
- '2010-08-19'],
- dtype='datetime64[D]')
+ '2007-07-13',
+ None,
+ '2006-01-15',
+ '2010-08-19'],
+ dtype='datetime64[D]')
_check_array_from_pandas_roundtrip(datetime64_d, type=dtype)
def test_array_from_pandas_date_with_mask(self):
@@ -1403,8 +1403,8 @@ class TestConvertDateTimeLikeTypes:
'a': [
pd.Timestamp('2012-11-11 00:00:00+01:00'),
pd.NaT
- ]
- })
+ ]
+ })
_check_pandas_roundtrip(df)
_check_serialize_components_roundtrip(df)
@@ -1774,14 +1774,14 @@ class TestConvertListTypes:
def test_column_of_decimal_list(self):
array = pa.array([[decimal.Decimal('1'), decimal.Decimal('2')],
- [decimal.Decimal('3.3')]],
+ [decimal.Decimal('3.3')]],
type=pa.list_(pa.decimal128(2, 1)))
table = pa.Table.from_arrays([array], names=['col1'])
df = table.to_pandas()
expected_df = pd.DataFrame(
- {'col1': [[decimal.Decimal('1'), decimal.Decimal('2')],
- [decimal.Decimal('3.3')]]})
+ {'col1': [[decimal.Decimal('1'), decimal.Decimal('2')],
+ [decimal.Decimal('3.3')]]})
tm.assert_frame_equal(df, expected_df)
def test_nested_types_from_ndarray_null_entries(self):
@@ -3910,32 +3910,32 @@ def test_metadata_compat_missing_field_name():
# metadata generated by fastparquet 0.3.2 with missing field_names
table = table.replace_schema_metadata({
- b'pandas': json.dumps(
- {'column_indexes': [
+ b'pandas': json.dumps({
+ 'column_indexes': [
{'field_name': None,
'metadata': None,
'name': None,
'numpy_type': 'object',
'pandas_type': 'mixed-integer'}
- ],
- 'columns': [
- {'metadata': None,
- 'name': 'a',
- 'numpy_type': 'int64',
- 'pandas_type': 'int64'},
- {'metadata': None,
- 'name': 'b',
- 'numpy_type': 'object',
- 'pandas_type': 'unicode'}
- ],
- 'index_columns': [
- {'kind': 'range',
- 'name': 'qux',
- 'start': 0,
- 'step': 2,
- 'stop': 8}
- ],
- 'pandas_version': '0.25.0'}
+ ],
+ 'columns': [
+ {'metadata': None,
+ 'name': 'a',
+ 'numpy_type': 'int64',
+ 'pandas_type': 'int64'},
+ {'metadata': None,
+ 'name': 'b',
+ 'numpy_type': 'object',
+ 'pandas_type': 'unicode'}
+ ],
+ 'index_columns': [
+ {'kind': 'range',
+ 'name': 'qux',
+ 'start': 0,
+ 'step': 2,
+ 'stop': 8}
+ ],
+ 'pandas_version': '0.25.0'}
)})
result = table.to_pandas()
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index e146c08..6c7b6d4 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -341,8 +341,8 @@ def test_nested_list_nonnullable_roundtrip_bug(use_legacy_dataset):
typ = pa.list_(pa.field("item", pa.float32(), False))
num_rows = 10000
t = pa.table([
- pa.array(([[0] * ((i + 5) % 10) for i in range(0, 10)]
- * (num_rows // 10)), type=typ)
+ pa.array(([[0] * ((i + 5) % 10) for i in range(0, 10)] *
+ (num_rows // 10)), type=typ)
], ['a'])
_check_roundtrip(
t, data_page_size=4096, use_legacy_dataset=use_legacy_dataset)
diff --git a/python/pyarrow/tests/test_plasma.py b/python/pyarrow/tests/test_plasma.py
index 967e69d..c574735 100644
--- a/python/pyarrow/tests/test_plasma.py
+++ b/python/pyarrow/tests/test_plasma.py
@@ -194,8 +194,8 @@ class TestPlasmaClient:
with_meta=True)
assert data_tuple[1].to_pybytes() == i * b'a'
assert (self.plasma_client.get_metadata(
- [object_ids[i]])[0].to_pybytes()
- == i * b'b')
+ [object_ids[i]])[0].to_pybytes() ==
+ i * b'b')
# Make sure that creating the same object twice raises an exception.
object_id = random_object_id()
@@ -262,7 +262,7 @@ class TestPlasmaClient:
[object_id], timeout_ms=1, with_meta=True)[0][1] is None
self.plasma_client.seal(object_id)
assert self.plasma_client.get_buffers(
- [object_id], timeout_ms=0, with_meta=True)[0][1]is not None
+ [object_id], timeout_ms=0, with_meta=True)[0][1] is not None
def test_buffer_lifetime(self):
# ARROW-2195
@@ -758,8 +758,8 @@ class TestPlasmaClient:
data_sizes = [np.random.randint(1000) + 1 for _ in range(i)]
for j in range(i):
x = self.plasma_client2.create(
- object_ids[j], data_sizes[j],
- metadata=bytearray(np.random.bytes(metadata_sizes[j])))
+ object_ids[j], data_sizes[j],
+ metadata=bytearray(np.random.bytes(metadata_sizes[j])))
self.plasma_client2.seal(object_ids[j])
del x
# Check that we received notifications for creating all of the
@@ -794,8 +794,8 @@ class TestPlasmaClient:
data_sizes.append(np.random.randint(1000))
for i in range(num_object_ids):
x = self.plasma_client2.create(
- object_ids[i], data_sizes[i],
- metadata=bytearray(np.random.bytes(metadata_sizes[i])))
+ object_ids[i], data_sizes[i],
+ metadata=bytearray(np.random.bytes(metadata_sizes[i])))
self.plasma_client2.seal(object_ids[i])
del x
for i in range(num_object_ids):
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index ede2b33..f1e0cf4 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -134,7 +134,7 @@ def assert_equal(obj1, obj2):
assert obj1.equals(obj2)
else:
assert type(obj1) == type(obj2) and obj1 == obj2, \
- "Objects {} and {} are different.".format(obj1, obj2)
+ "Objects {} and {} are different.".format(obj1, obj2)
PRIMITIVE_OBJECTS = [
@@ -832,7 +832,7 @@ def test_pyarrow_objects_serialization(large_buffer):
# or it will affect 'test_total_bytes_allocated'.
pyarrow_objects = [
pa.array([1, 2, 3, 4]), pa.array(['1', 'never U+1F631', '',
- "233 * U+1F600"]),
+ "233 * U+1F600"]),
pa.array([1, None, 2, 3]),
pa.Tensor.from_numpy(np.random.rand(2, 3, 4)),
pa.RecordBatch.from_arrays(
@@ -841,7 +841,7 @@ def test_pyarrow_objects_serialization(large_buffer):
['a', 'b']),
pa.Table.from_arrays([pa.array([1, None, 2, 3]),
pa.array(['1', 'never U+1F631', '',
- "233 * u1F600"])],
+ "233 * u1F600"])],
['a', 'b'])
]
for obj in pyarrow_objects:
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 5f2e380..c2ba86a 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -100,6 +100,7 @@ cdef class DataType:
Each data type is an *instance* of this class.
"""
+
def __cinit__(self):
pass
@@ -215,6 +216,7 @@ cdef class DictionaryMemo:
"""
Tracking container for dictionary-encoded fields.
"""
+
def __cinit__(self):
self.sp_memo.reset(new CDictionaryMemo())
self.memo = self.sp_memo.get()
@@ -966,6 +968,7 @@ cdef class Field:
-----
Do not use this class's constructor directly; use pyarrow.field
"""
+
def __cinit__(self):
pass
diff --git a/python/setup.cfg b/python/setup.cfg
index 162a507..9aaad4f 100644
--- a/python/setup.cfg
+++ b/python/setup.cfg
@@ -28,3 +28,7 @@ build-dir = doc/_build
addopts = --ignore=scripts
filterwarnings =
error:The SparseDataFrame:FutureWarning
+
+[pep8]
+ignore = E211,E225,E226,E227,E402,W504
+max_line_length = 79
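For reference, the ignored pycodestyle codes are (my annotations, not part of the commit): E211 whitespace before a parenthesis or bracket (presumably to accommodate Cython syntax in the .pyx/.pxd files), E225/E226/E227 missing whitespace around operators (general, arithmetic, bitwise/shift), E402 module-level import not at top of file, and W504 line break after a binary operator. autopep8 reads this section through the `--global-config setup.cfg --ignore-local-config` arguments passed in `python_linter` above.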
diff --git a/python/setup.py b/python/setup.py
index 134d6a4..f6f6b45 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -281,8 +281,8 @@ class build_ext(_build_ext):
# Do the build
print("-- Running cmake --build for pyarrow")
- self.spawn(['cmake', '--build', '.', '--config', self.build_type]
- + build_tool_args)
+ self.spawn(['cmake', '--build', '.', '--config', self.build_type] +
+ build_tool_args)
print("-- Finished cmake --build for pyarrow")
if self.inplace:
@@ -513,8 +513,8 @@ def _move_shared_libs_unix(build_prefix, build_lib, lib_name):
# If the event of not running from a git clone (e.g. from a git archive
# or a Python sdist), see if we can set the version number ourselves
default_version = '0.18.0-SNAPSHOT'
-if (not os.path.exists('../.git')
- and not os.environ.get('SETUPTOOLS_SCM_PRETEND_VERSION')):
+if (not os.path.exists('../.git') and
+ not os.environ.get('SETUPTOOLS_SCM_PRETEND_VERSION')):
if os.path.exists('PKG-INFO'):
# We're probably in a Python sdist, setuptools_scm will handle fine
pass