You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2024/01/11 18:00:56 UTC

(arrow) branch main updated: GH-37164: [Python] Attach Python stacktrace to errors in `ConvertPyError` (#39380)

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 6fe7480125 GH-37164: [Python] Attach Python stacktrace to errors in `ConvertPyError` (#39380)
6fe7480125 is described below

commit 6fe7480125b7fdb3a000a27fcc9cf464697b8a60
Author: Will Jones <wi...@gmail.com>
AuthorDate: Thu Jan 11 10:00:49 2024 -0800

    GH-37164: [Python] Attach Python stacktrace to errors in `ConvertPyError` (#39380)
    
    ### Rationale for this change
    
    Users might define Python generators that are used in RecordBatchReaders and then exported through the C Data Interface. However, if an error occurs in their generator, the stacktrace and message are currently swallowed in the current `ConvertPyError` implementation, which only provides the type of error. This makes debugging code that passed RBRs difficult.
    
    ### What changes are included in this PR?
    
    Changes `ConvertPyError` to provide the fully formatted traceback in the error message.
    
    ### Are these changes tested?
    
    Yes, added one test to validate the errors messages are propagated.
    
    ### Are there any user-facing changes?
    
    This is a minor change in the error reporting behavior, which will provide more information.
    * Closes: #37164
    
    Authored-by: Will Jones <wi...@gmail.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 python/pyarrow/src/arrow/python/common.cc      | 49 ++++++++++++++++++++++++--
 python/pyarrow/src/arrow/python/python_test.cc | 17 ++++++---
 python/pyarrow/tests/test_cffi.py              | 34 ++++++++++++++++++
 3 files changed, 92 insertions(+), 8 deletions(-)

diff --git a/python/pyarrow/src/arrow/python/common.cc b/python/pyarrow/src/arrow/python/common.cc
index 6fe2ed4dae..2f44a9122f 100644
--- a/python/pyarrow/src/arrow/python/common.cc
+++ b/python/pyarrow/src/arrow/python/common.cc
@@ -19,6 +19,7 @@
 
 #include <cstdlib>
 #include <mutex>
+#include <sstream>
 #include <string>
 
 #include "arrow/memory_pool.h"
@@ -90,9 +91,15 @@ class PythonErrorDetail : public StatusDetail {
 
   std::string ToString() const override {
     // This is simple enough not to need the GIL
-    const auto ty = reinterpret_cast<const PyTypeObject*>(exc_type_.obj());
-    // XXX Should we also print traceback?
-    return std::string("Python exception: ") + ty->tp_name;
+    Result<std::string> result = FormatImpl();
+
+    if (result.ok()) {
+      return result.ValueOrDie();
+    } else {
+      // Fallback to just the exception type
+      const auto ty = reinterpret_cast<const PyTypeObject*>(exc_type_.obj());
+      return std::string("Python exception: ") + ty->tp_name;
+    }
   }
 
   void RestorePyError() const {
@@ -131,6 +138,42 @@ class PythonErrorDetail : public StatusDetail {
   }
 
  protected:
+  Result<std::string> FormatImpl() const {
+    PyAcquireGIL lock;
+
+    // Use traceback.format_exception()
+    OwnedRef traceback_module;
+    RETURN_NOT_OK(internal::ImportModule("traceback", &traceback_module));
+
+    OwnedRef fmt_exception;
+    RETURN_NOT_OK(internal::ImportFromModule(traceback_module.obj(), "format_exception",
+                                             &fmt_exception));
+
+    OwnedRef formatted;
+    formatted.reset(PyObject_CallFunctionObjArgs(fmt_exception.obj(), exc_type_.obj(),
+                                                 exc_value_.obj(), exc_traceback_.obj(),
+                                                 NULL));
+    RETURN_IF_PYERROR();
+
+    std::stringstream ss;
+    ss << "Python exception: ";
+    Py_ssize_t num_lines = PySequence_Length(formatted.obj());
+    RETURN_IF_PYERROR();
+
+    for (Py_ssize_t i = 0; i < num_lines; ++i) {
+      Py_ssize_t line_size;
+
+      PyObject* line = PySequence_GetItem(formatted.obj(), i);
+      RETURN_IF_PYERROR();
+
+      const char* data = PyUnicode_AsUTF8AndSize(line, &line_size);
+      RETURN_IF_PYERROR();
+
+      ss << std::string_view(data, line_size);
+    }
+    return ss.str();
+  }
+
   PythonErrorDetail() = default;
 
   OwnedRefNoGIL exc_type_, exc_value_, exc_traceback_;
diff --git a/python/pyarrow/src/arrow/python/python_test.cc b/python/pyarrow/src/arrow/python/python_test.cc
index 01ab8a3038..746bf41091 100644
--- a/python/pyarrow/src/arrow/python/python_test.cc
+++ b/python/pyarrow/src/arrow/python/python_test.cc
@@ -174,10 +174,14 @@ Status TestOwnedRefNoGILMoves() {
   }
 }
 
-std::string FormatPythonException(const std::string& exc_class_name) {
+std::string FormatPythonException(const std::string& exc_class_name,
+                                  const std::string& exc_value) {
   std::stringstream ss;
   ss << "Python exception: ";
   ss << exc_class_name;
+  ss << ": ";
+  ss << exc_value;
+  ss << "\n";
   return ss.str();
 }
 
@@ -205,7 +209,8 @@ Status TestCheckPyErrorStatus() {
   }
 
   PyErr_SetString(PyExc_TypeError, "some error");
-  ASSERT_OK(check_error(st, "some error", FormatPythonException("TypeError")));
+  ASSERT_OK(
+      check_error(st, "some error", FormatPythonException("TypeError", "some error")));
   ASSERT_TRUE(st.IsTypeError());
 
   PyErr_SetString(PyExc_ValueError, "some error");
@@ -223,7 +228,8 @@ Status TestCheckPyErrorStatus() {
   }
 
   PyErr_SetString(PyExc_NotImplementedError, "some error");
-  ASSERT_OK(check_error(st, "some error", FormatPythonException("NotImplementedError")));
+  ASSERT_OK(check_error(st, "some error",
+                        FormatPythonException("NotImplementedError", "some error")));
   ASSERT_TRUE(st.IsNotImplemented());
 
   // No override if a specific status code is given
@@ -246,7 +252,8 @@ Status TestCheckPyErrorStatusNoGIL() {
     lock.release();
     ASSERT_TRUE(st.IsUnknownError());
     ASSERT_EQ(st.message(), "zzzt");
-    ASSERT_EQ(st.detail()->ToString(), FormatPythonException("ZeroDivisionError"));
+    ASSERT_EQ(st.detail()->ToString(),
+              FormatPythonException("ZeroDivisionError", "zzzt"));
     return Status::OK();
   }
 }
@@ -257,7 +264,7 @@ Status TestRestorePyErrorBasics() {
   ASSERT_FALSE(PyErr_Occurred());
   ASSERT_TRUE(st.IsUnknownError());
   ASSERT_EQ(st.message(), "zzzt");
-  ASSERT_EQ(st.detail()->ToString(), FormatPythonException("ZeroDivisionError"));
+  ASSERT_EQ(st.detail()->ToString(), FormatPythonException("ZeroDivisionError", "zzzt"));
 
   RestorePyError(st);
   ASSERT_TRUE(PyErr_Occurred());
diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py
index a9c17cc100..ff81b06440 100644
--- a/python/pyarrow/tests/test_cffi.py
+++ b/python/pyarrow/tests/test_cffi.py
@@ -414,6 +414,40 @@ def test_export_import_batch_reader(reader_factory):
         pa.RecordBatchReader._import_from_c(ptr_stream)
 
 
+@needs_cffi
+def test_export_import_exception_reader():
+    # See: https://github.com/apache/arrow/issues/37164
+    c_stream = ffi.new("struct ArrowArrayStream*")
+    ptr_stream = int(ffi.cast("uintptr_t", c_stream))
+
+    gc.collect()  # Make sure no Arrow data dangles in a ref cycle
+    old_allocated = pa.total_allocated_bytes()
+
+    def gen():
+        if True:
+            try:
+                raise ValueError('foo')
+            except ValueError as e:
+                raise NotImplementedError('bar') from e
+        else:
+            yield from make_batches()
+
+    original = pa.RecordBatchReader.from_batches(make_schema(), gen())
+    original._export_to_c(ptr_stream)
+
+    reader = pa.RecordBatchReader._import_from_c(ptr_stream)
+    with pytest.raises(OSError) as exc_info:
+        reader.read_next_batch()
+
+    # inner *and* outer exception should be present
+    assert 'ValueError: foo' in str(exc_info.value)
+    assert 'NotImplementedError: bar' in str(exc_info.value)
+    # Stacktrace containing line of the raise statement
+    assert 'raise ValueError(\'foo\')' in str(exc_info.value)
+
+    assert pa.total_allocated_bytes() == old_allocated
+
+
 @needs_cffi
 def test_imported_batch_reader_error():
     c_stream = ffi.new("struct ArrowArrayStream*")