You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2019/02/21 10:07:15 UTC
[arrow] branch master updated: ARROW-4559: [Python] Allow Parquet files with special characters in their names
This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 6711404 ARROW-4559: [Python] Allow Parquet files with special characters in their names
6711404 is described below
commit 6711404a131f2ebf7d41f062d69eed6018108f71
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Thu Feb 21 11:06:50 2019 +0100
ARROW-4559: [Python] Allow Parquet files with special characters in their names
Author: Antoine Pitrou <an...@python.org>
Closes #3718 from pitrou/ARROW-4559-special-chars-filename and squashes the following commits:
2e854426 <Antoine Pitrou> ARROW-4559: Allow Parquet files with special characters in their names
---
python/pyarrow/filesystem.py | 14 ++++++++++---
python/pyarrow/tests/test_filesystem.py | 37 +++++++++++++++++++++++++++++++++
python/pyarrow/tests/test_parquet.py | 11 ++++++++++
3 files changed, 59 insertions(+), 3 deletions(-)
diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py
index 64148d3..98fb773 100644
--- a/python/pyarrow/filesystem.py
+++ b/python/pyarrow/filesystem.py
@@ -399,7 +399,8 @@ def _ensure_filesystem(fs):
def resolve_filesystem_and_path(where, filesystem=None):
"""
- return filesystem from path which could be an HDFS URI
+ Return filesystem from path which could be an HDFS URI, a local URI,
+ or a plain filesystem path.
"""
if not _is_path_like(where):
if filesystem is not None:
@@ -407,7 +408,6 @@ def resolve_filesystem_and_path(where, filesystem=None):
" there is nothing to open with filesystem.")
return filesystem, where
- # input can be hdfs URI such as hdfs://host:port/myfile.parquet
path = _stringify_path(where)
if filesystem is not None:
@@ -415,6 +415,7 @@ def resolve_filesystem_and_path(where, filesystem=None):
parsed_uri = urlparse(path)
if parsed_uri.scheme == 'hdfs':
+ # Input is hdfs URI such as hdfs://host:port/myfile.parquet
netloc_split = parsed_uri.netloc.split(':')
host = netloc_split[0]
if host == '':
@@ -423,7 +424,14 @@ def resolve_filesystem_and_path(where, filesystem=None):
if len(netloc_split) == 2 and netloc_split[1].isnumeric():
port = int(netloc_split[1])
fs = pa.hdfs.connect(host=host, port=port)
+ fs_path = parsed_uri.path
+ elif parsed_uri.scheme == 'file':
+ # Input is local URI such as file:///home/user/myfile.parquet
+ fs = LocalFileSystem.get_instance()
+ fs_path = parsed_uri.path
else:
+ # Input is local path such as /home/user/myfile.parquet
fs = LocalFileSystem.get_instance()
+ fs_path = where
- return fs, parsed_uri.path
+ return fs, fs_path
diff --git a/python/pyarrow/tests/test_filesystem.py b/python/pyarrow/tests/test_filesystem.py
new file mode 100644
index 0000000..4a6606f
--- /dev/null
+++ b/python/pyarrow/tests/test_filesystem.py
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow import filesystem
+
+
+def test_resolve_uri():
+ uri = "file:///home/user/myfile.parquet"
+ fs, path = filesystem.resolve_filesystem_and_path(uri)
+ assert isinstance(fs, filesystem.LocalFileSystem)
+ assert path == "/home/user/myfile.parquet"
+
+
+def test_resolve_local_path():
+ for uri in ['/home/user/myfile.parquet',
+ 'myfile.parquet',
+ 'my # file ? parquet',
+ 'C:/Windows/myfile.parquet',
+ r'C:\\Windows\\myfile.parquet',
+ ]:
+ fs, path = filesystem.resolve_filesystem_and_path(uri)
+ assert isinstance(fs, filesystem.LocalFileSystem)
+ assert path == uri
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index dc7fbef..2dae713 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -199,6 +199,17 @@ def test_no_memory_map(tempdir):
assert table_read.equals(table)
+def test_special_chars_filename(tempdir):
+ table = pa.Table.from_arrays([pa.array([42])], ["ints"])
+ filename = "foo # bar"
+ path = tempdir / filename
+ assert not path.exists()
+ _write_table(table, str(path))
+ assert path.exists()
+ table_read = _read_table(str(path))
+ assert table_read.equals(table)
+
+
def test_empty_table_roundtrip():
df = alltypes_sample(size=10)
# The nanosecond->us conversion is a nuisance, so we just avoid it here