You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/05/18 15:08:31 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #7156: ARROW-8074: [C++][Dataset][Python] FileFragments from buffers and NativeFiles

pitrou commented on a change in pull request #7156:
URL: https://github.com/apache/arrow/pull/7156#discussion_r426687933



##########
File path: python/pyarrow/dataset.py
##########
@@ -314,6 +315,15 @@ def _ensure_multiple_sources(paths, filesystem=None):
     return filesystem, paths
 
 
+def _is_native_file_wrappable(source):
+    import io
+    return isinstance(source, (FileSource,
+                               io.BytesIO,

Review comment:
       You want `io.BufferedIOBase` here.

##########
File path: cpp/src/arrow/python/common.cc
##########
@@ -96,42 +109,38 @@ class PythonErrorDetail : public StatusDetail {
   }
 
   void RestorePyError() const {
-    Py_INCREF(exc_type_.obj());
-    Py_INCREF(exc_value_.obj());
-    Py_INCREF(exc_traceback_.obj());
-    PyErr_Restore(exc_type_.obj(), exc_value_.obj(), exc_traceback_.obj());
+    PyErr_Restore(exc_type_.incref(), exc_value_.incref(), exc_traceback_.incref());
   }
 
   PyObject* exc_type() const { return exc_type_.obj(); }
 
   PyObject* exc_value() const { return exc_value_.obj(); }
 
-  static std::shared_ptr<PythonErrorDetail> FromPyError() {
-    PyObject* exc_type = nullptr;
-    PyObject* exc_value = nullptr;
-    PyObject* exc_traceback = nullptr;
-
-    PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
-    PyErr_NormalizeException(&exc_type, &exc_value, &exc_traceback);
-    ARROW_CHECK(exc_type)
-        << "PythonErrorDetail::FromPyError called without a Python error set";
-    DCHECK(PyType_Check(exc_type));
-    DCHECK(exc_value);  // Ensured by PyErr_NormalizeException, double-check
-    if (exc_traceback == nullptr) {
-      // Needed by PyErr_Restore()
-      Py_INCREF(Py_None);
-      exc_traceback = Py_None;
+  Status ToStatus(StatusCode code = StatusCode::UnknownError) && {
+    if (code == StatusCode::UnknownError) {
+      code = MapPyError(exc_type());
     }
 
-    std::shared_ptr<PythonErrorDetail> detail(new PythonErrorDetail);
-    detail->exc_type_.reset(exc_type);
-    detail->exc_value_.reset(exc_value);
-    detail->exc_traceback_.reset(exc_traceback);
-    return detail;
+    std::string message;
+    RETURN_NOT_OK(internal::PyObject_StdStringStr(exc_value(), &message));
+
+    auto detail = std::make_shared<PythonErrorDetail>(std::move(*this));

Review comment:
       Not very pretty...

##########
File path: cpp/src/arrow/python/common.h
##########
@@ -170,17 +176,23 @@ class ARROW_PYTHON_EXPORT OwnedRef {
  private:
   ARROW_DISALLOW_COPY_AND_ASSIGN(OwnedRef);
 
-  PyObject* obj_;
+  PyObject* obj_ = NULLPTR;
 };
 
 // Same as OwnedRef, but ensures the GIL is taken when it goes out of scope.
 // This is for situations where the GIL is not always known to be held
 // (e.g. if it is released in the middle of a function for performance reasons)
 class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef {
  public:
-  OwnedRefNoGIL() : OwnedRef() {}
-  OwnedRefNoGIL(OwnedRefNoGIL&& other) : OwnedRef(other.detach()) {}
-  explicit OwnedRefNoGIL(PyObject* obj) : OwnedRef(obj) {}
+  using OwnedRef::OwnedRef;
+  OwnedRefNoGIL() = default;
+  OwnedRefNoGIL(OwnedRefNoGIL&& other) = default;

Review comment:
       What does this do? Inherit from parent? This looks a bit ambiguous to me...

##########
File path: cpp/src/arrow/python/common.h
##########
@@ -303,5 +315,102 @@ static inline PyObject* cpp_PyObject_CallMethod(PyObject* obj, const char* metho
                              const_cast<char*>(argspec), args...);
 }
 
+template <typename Self, typename Fn>
+struct BoundMethod;
+
+template <typename Self, typename R, typename... A>
+struct BoundMethod<Self, R(A...)> {

Review comment:
       Do we really need this complication? What concrete problem is this solving?

##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -42,6 +43,51 @@ def _forbid_instantiation(klass, subclasses_instead=True):
     raise TypeError(msg)
 
 
+ctypedef CResult[shared_ptr[CRandomAccessFile]] CCustomOpen()
+
+cdef class FileSource:
+
+    cdef:
+        # XXX why is shared_ptr necessary here? CFileSource shouldn't need it
+        CFileSource wrapped
+
+    def __cinit__(self, file, FileSystem filesystem=None):
+        cdef:
+            shared_ptr[CFileSystem] c_filesystem
+            c_string c_path
+            function[CCustomOpen] c_open
+            shared_ptr[CBuffer] c_buffer
+
+        if isinstance(file, FileSource):
+            self.wrapped = (<FileSource> file).wrapped
+
+        elif isinstance(file, Buffer):
+            c_buffer = pyarrow_unwrap_buffer(file)
+            self.wrapped = CFileSource(move(c_buffer))
+
+        elif _is_path_like(file):
+            if filesystem is None:
+                raise ValueError("cannot construct a FileSource from "
+                                 "a path without a FileSystem")
+            c_filesystem = filesystem.unwrap()
+            c_path = tobytes(_stringify_path(file))
+            self.wrapped = CFileSource(move(c_path), move(c_filesystem))
+
+        else:
+            c_open = BindMethod[CCustomOpen](
+                wrap_python_file(file, mode='r'),
+                &NativeFile.get_random_access_file)
+            self.wrapped = CFileSource(move(c_open))
+
+    @staticmethod
+    def from_uri(uri):

Review comment:
       It's a bit weird to have a constructor that takes almost every kind of input, but a separate factory method for URIs. I'd say either have factory methods for everything, or accept everything in the constructor.

##########
File path: python/pyarrow/dataset.py
##########
@@ -411,7 +421,14 @@ def _filesystem_dataset(source, schema=None, filesystem=None,
     partitioning = _ensure_partitioning(partitioning)
 
     if isinstance(source, (list, tuple)):
-        fs, paths_or_selector = _ensure_multiple_sources(source, filesystem)
+        if all(_is_path_like(elem) for elem in source):
+            fs, paths_or_selector = _ensure_multiple_sources(source,
+                                                             filesystem)
+        else:
+            fs, paths_or_selector = _MockFileSystem(), source

Review comment:
       Why the MockFileSystem fallback here?

##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -42,6 +43,51 @@ def _forbid_instantiation(klass, subclasses_instead=True):
     raise TypeError(msg)
 
 
+ctypedef CResult[shared_ptr[CRandomAccessFile]] CCustomOpen()
+
+cdef class FileSource:
+
+    cdef:
+        # XXX why is shared_ptr necessary here? CFileSource shouldn't need it
+        CFileSource wrapped
+
+    def __cinit__(self, file, FileSystem filesystem=None):
+        cdef:
+            shared_ptr[CFileSystem] c_filesystem
+            c_string c_path
+            function[CCustomOpen] c_open
+            shared_ptr[CBuffer] c_buffer
+
+        if isinstance(file, FileSource):
+            self.wrapped = (<FileSource> file).wrapped
+
+        elif isinstance(file, Buffer):
+            c_buffer = pyarrow_unwrap_buffer(file)
+            self.wrapped = CFileSource(move(c_buffer))
+
+        elif _is_path_like(file):
+            if filesystem is None:
+                raise ValueError("cannot construct a FileSource from "
+                                 "a path without a FileSystem")
+            c_filesystem = filesystem.unwrap()
+            c_path = tobytes(_stringify_path(file))
+            self.wrapped = CFileSource(move(c_path), move(c_filesystem))
+
+        else:
+            c_open = BindMethod[CCustomOpen](
+                wrap_python_file(file, mode='r'),
+                &NativeFile.get_random_access_file)

Review comment:
       Is it any useful to make the NativeFile construction lazy? I'm not sure I understand what this is saving.

##########
File path: python/pyarrow/includes/libarrow_dataset.pxd
##########
@@ -194,7 +194,11 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
         const c_string& path() const
         const shared_ptr[CFileSystem]& filesystem() const
         const shared_ptr[CBuffer]& buffer() const
-        CFileSource(c_string path, shared_ptr[CFileSystem] filesystem)
+        # HACK: Cython can't handle all the overloads so don't declare them.

Review comment:
       I'm curious, did you report a bug to Cython?

##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -42,6 +43,51 @@ def _forbid_instantiation(klass, subclasses_instead=True):
     raise TypeError(msg)
 
 
+ctypedef CResult[shared_ptr[CRandomAccessFile]] CCustomOpen()
+
+cdef class FileSource:
+
+    cdef:
+        # XXX why is shared_ptr necessary here? CFileSource shouldn't need it

Review comment:
       Perhaps a bug in CFileSource's copy/move constructors or operators?

##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -42,6 +43,51 @@ def _forbid_instantiation(klass, subclasses_instead=True):
     raise TypeError(msg)
 
 
+ctypedef CResult[shared_ptr[CRandomAccessFile]] CCustomOpen()
+
+cdef class FileSource:
+
+    cdef:
+        # XXX why is shared_ptr necessary here? CFileSource shouldn't need it
+        CFileSource wrapped
+
+    def __cinit__(self, file, FileSystem filesystem=None):
+        cdef:
+            shared_ptr[CFileSystem] c_filesystem
+            c_string c_path
+            function[CCustomOpen] c_open
+            shared_ptr[CBuffer] c_buffer
+
+        if isinstance(file, FileSource):
+            self.wrapped = (<FileSource> file).wrapped
+
+        elif isinstance(file, Buffer):
+            c_buffer = pyarrow_unwrap_buffer(file)
+            self.wrapped = CFileSource(move(c_buffer))
+
+        elif _is_path_like(file):
+            if filesystem is None:
+                raise ValueError("cannot construct a FileSource from "
+                                 "a path without a FileSystem")

Review comment:
       Interesting, it doesn't use a local filesystem as default? (or doesn't accept a URI?)

##########
File path: python/pyarrow/io.pxi
##########
@@ -1377,6 +1377,21 @@ cdef shared_ptr[CBuffer] as_c_buffer(object o) except *:
     return buf
 
 
+cdef NativeFile wrap_python_file(object source, str mode):

Review comment:
       Is there a reason not to reuse `_get_native_file`?

##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -1425,3 +1425,13 @@ def test_feather_format(tempdir):
     write_feather(table, str(basedir / "data1.feather"), version=1)
     with pytest.raises(ValueError):
         ds.dataset(basedir, format="feather").to_table()
+
+
+def test_file_source_refcount():
+    from io import BytesIO
+    from sys import getrefcount

Review comment:
       Please put all standard imports at the top level.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org