You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2019/02/21 10:07:15 UTC

[arrow] branch master updated: ARROW-4559: [Python] Allow Parquet files with special characters in their names

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 6711404  ARROW-4559: [Python] Allow Parquet files with special characters in their names
6711404 is described below

commit 6711404a131f2ebf7d41f062d69eed6018108f71
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Thu Feb 21 11:06:50 2019 +0100

    ARROW-4559: [Python] Allow Parquet files with special characters in their names
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #3718 from pitrou/ARROW-4559-special-chars-filename and squashes the following commits:
    
    2e854426 <Antoine Pitrou> ARROW-4559:  Allow Parquet files with special characters in their names
---
 python/pyarrow/filesystem.py            | 14 ++++++++++---
 python/pyarrow/tests/test_filesystem.py | 37 +++++++++++++++++++++++++++++++++
 python/pyarrow/tests/test_parquet.py    | 11 ++++++++++
 3 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py
index 64148d3..98fb773 100644
--- a/python/pyarrow/filesystem.py
+++ b/python/pyarrow/filesystem.py
@@ -399,7 +399,8 @@ def _ensure_filesystem(fs):
 
 def resolve_filesystem_and_path(where, filesystem=None):
     """
-    return filesystem from path which could be an HDFS URI
+    Return filesystem from path which could be an HDFS URI, a local URI,
+    or a plain filesystem path.
     """
     if not _is_path_like(where):
         if filesystem is not None:
@@ -407,7 +408,6 @@ def resolve_filesystem_and_path(where, filesystem=None):
                              " there is nothing to open with filesystem.")
         return filesystem, where
 
-    # input can be hdfs URI such as hdfs://host:port/myfile.parquet
     path = _stringify_path(where)
 
     if filesystem is not None:
@@ -415,6 +415,7 @@ def resolve_filesystem_and_path(where, filesystem=None):
 
     parsed_uri = urlparse(path)
     if parsed_uri.scheme == 'hdfs':
+        # Input is hdfs URI such as hdfs://host:port/myfile.parquet
         netloc_split = parsed_uri.netloc.split(':')
         host = netloc_split[0]
         if host == '':
@@ -423,7 +424,14 @@ def resolve_filesystem_and_path(where, filesystem=None):
         if len(netloc_split) == 2 and netloc_split[1].isnumeric():
             port = int(netloc_split[1])
         fs = pa.hdfs.connect(host=host, port=port)
+        fs_path = parsed_uri.path
+    elif parsed_uri.scheme == 'file':
+        # Input is local URI such as file:///home/user/myfile.parquet
+        fs = LocalFileSystem.get_instance()
+        fs_path = parsed_uri.path
     else:
+        # Input is local path such as /home/user/myfile.parquet
         fs = LocalFileSystem.get_instance()
+        fs_path = where
 
-    return fs, parsed_uri.path
+    return fs, fs_path
diff --git a/python/pyarrow/tests/test_filesystem.py b/python/pyarrow/tests/test_filesystem.py
new file mode 100644
index 0000000..4a6606f
--- /dev/null
+++ b/python/pyarrow/tests/test_filesystem.py
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow import filesystem
+
+
+def test_resolve_uri():  # file:// URI -> local filesystem + bare path
+    local_fs, local_path = filesystem.resolve_filesystem_and_path(
+        "file:///home/user/myfile.parquet")
+    assert isinstance(local_fs, filesystem.LocalFileSystem)
+    assert local_path == "/home/user/myfile.parquet"
+
+
+def test_resolve_local_path():  # plain local paths come back unchanged
+    for uri in ['/home/user/myfile.parquet',
+                'myfile.parquet',
+                'my # file ? parquet',  # URI-reserved chars must survive
+                'C:/Windows/myfile.parquet',
+                r'C:\Windows\myfile.parquet',
+                ]:
+        fs, path = filesystem.resolve_filesystem_and_path(uri)
+        assert isinstance(fs, filesystem.LocalFileSystem)
+        assert path == uri
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index dc7fbef..2dae713 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -199,6 +199,17 @@ def test_no_memory_map(tempdir):
     assert table_read.equals(table)
 
 
+def test_special_chars_filename(tempdir):
+    # '#' starts a URI fragment, so naive URL parsing would truncate this
+    target = tempdir / "foo # bar"
+    expected = pa.Table.from_arrays([pa.array([42])], ["ints"])
+    assert not target.exists()
+    _write_table(expected, str(target))
+    assert target.exists()
+    roundtripped = _read_table(str(target))
+    assert roundtripped.equals(expected)
+
+
 def test_empty_table_roundtrip():
     df = alltypes_sample(size=10)
     # The nanosecond->us conversion is a nuisance, so we just avoid it here