Posted to commits@spark.apache.org by gu...@apache.org on 2022/03/18 05:04:13 UTC

[spark] branch branch-3.3 updated: [SPARK-38563][PYTHON] Upgrade to Py4J 0.10.9.5

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new 7bb1d6f  [SPARK-38563][PYTHON] Upgrade to Py4J 0.10.9.5
7bb1d6f is described below

commit 7bb1d6f01148b037acad12de8166cf742cd30ea3
Author: Hyukjin Kwon <gu...@apache.org>
AuthorDate: Fri Mar 18 14:00:48 2022 +0900

    [SPARK-38563][PYTHON] Upgrade to Py4J 0.10.9.5
    
    ### What changes were proposed in this pull request?
    
    This PR is a retry of https://github.com/apache/spark/pull/35871, bumping the version up to 0.10.9.5.
    The earlier PR was reverted because it broke PySpark on Python 3.10, and Python 3.10 was not officially supported in Py4J at the time.
    
    In Py4J 0.10.9.5, the issue was fixed (https://github.com/py4j/py4j/pull/475), and Python 3.10 support was officially added with CI set up (https://github.com/py4j/py4j/pull/477).
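    
    The diff below also removes the manual Py4J connection cleanup helper (`_clean_py4j_conn_for_current_thread`) from `python/pyspark/util.py`; user-facing `pyspark.InheritableThread` usage stays the same. A minimal usage sketch (an illustration, not part of this patch), assuming an active SparkSession named `spark`:
    
    ```python
    # Illustrative sketch: local properties set in the parent thread are
    # inherited by the child thread via pyspark.InheritableThread; the manual
    # Py4J connection cleanup previously done by PySpark is removed in this patch.
    from pyspark import InheritableThread

    spark.sparkContext.setLocalProperty("spark.scheduler.pool", "pool1")

    def run_job():
        # Runs with the inherited "spark.scheduler.pool" local property.
        spark.range(10).count()

    t = InheritableThread(target=run_job)
    t.start()
    t.join()
    ```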
    
    ### Why are the changes needed?
    
    See https://github.com/apache/spark/pull/35871
    
    ### Does this PR introduce _any_ user-facing change?
    
    See https://github.com/apache/spark/pull/35871
    
    ### How was this patch tested?
    
    Py4J now runs CI against Python 3.10, and I manually tested PySpark with Python 3.10 with this patch:
    
    ```bash
    ./bin/pyspark
    ```
    
    ```python
    import py4j
    py4j.__version__
    spark.range(10).show()
    ```
    
    ```
    Using Python version 3.10.0 (default, Mar  3 2022 03:57:21)
    Spark context Web UI available at http://172.30.5.50:4040
    Spark context available as 'sc' (master = local[*], app id = local-1647571387534).
    SparkSession available as 'spark'.
    >>> import py4j
    >>> py4j.__version__
    '0.10.9.5'
    >>> spark.range(10).show()
    +---+
    | id|
    +---+
    ...
    ```
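    
    For reference, a quick way to double-check which Py4J version is bundled and pinned after this change (a small sketch; paths assume a Spark source checkout with this patch applied):
    
    ```bash
    # The Py4J source zip shipped under python/lib should now be 0.10.9.5.
    ls python/lib/ | grep py4j
    # expected: py4j-0.10.9.5-src.zip

    # The pip requirement pinned in python/setup.py should match.
    grep 'py4j==' python/setup.py
    # expected: install_requires=['py4j==0.10.9.5'],
    ```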
    
    Closes #35907 from HyukjinKwon/SPARK-38563-followup.
    
    Authored-by: Hyukjin Kwon <gu...@apache.org>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
    (cherry picked from commit 97335ea037a9a036c013c86ef62d74ca638f808e)
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 bin/pyspark                                        |   2 +-
 bin/pyspark2.cmd                                   |   2 +-
 core/pom.xml                                       |   2 +-
 .../org/apache/spark/api/python/PythonUtils.scala  |   2 +-
 dev/deps/spark-deps-hadoop-2-hive-2.3              |   2 +-
 dev/deps/spark-deps-hadoop-3-hive-2.3              |   2 +-
 docs/job-scheduling.md                             |   2 +-
 python/docs/Makefile                               |   2 +-
 python/docs/make2.bat                              |   2 +-
 python/docs/source/getting_started/install.rst     |   2 +-
 python/lib/py4j-0.10.9.3-src.zip                   | Bin 42021 -> 0 bytes
 python/lib/py4j-0.10.9.5-src.zip                   | Bin 0 -> 42404 bytes
 python/pyspark/context.py                          |   6 ++--
 python/pyspark/util.py                             |  35 +++------------------
 python/setup.py                                    |   2 +-
 sbin/spark-config.sh                               |   2 +-
 16 files changed, 20 insertions(+), 45 deletions(-)

diff --git a/bin/pyspark b/bin/pyspark
index 4840589..21a514e 100755
--- a/bin/pyspark
+++ b/bin/pyspark
@@ -50,7 +50,7 @@ export PYSPARK_DRIVER_PYTHON_OPTS
 
 # Add the PySpark classes to the Python path:
 export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH"
-export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.3-src.zip:$PYTHONPATH"
+export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.5-src.zip:$PYTHONPATH"
 
 # Load the PySpark shell.py script when ./pyspark is used interactively:
 export OLD_PYTHONSTARTUP="$PYTHONSTARTUP"
diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd
index a19627a..eec02a4 100644
--- a/bin/pyspark2.cmd
+++ b/bin/pyspark2.cmd
@@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" (
 )
 
 set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH%
-set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.9.3-src.zip;%PYTHONPATH%
+set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.9.5-src.zip;%PYTHONPATH%
 
 set OLD_PYTHONSTARTUP=%PYTHONSTARTUP%
 set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py
diff --git a/core/pom.xml b/core/pom.xml
index 9d3b170..a753a59 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -423,7 +423,7 @@
     <dependency>
       <groupId>net.sf.py4j</groupId>
       <artifactId>py4j</artifactId>
-      <version>0.10.9.3</version>
+      <version>0.10.9.5</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala
index 8daba86..6336171 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala
@@ -27,7 +27,7 @@ import org.apache.spark.SparkContext
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
 
 private[spark] object PythonUtils {
-  val PY4J_ZIP_NAME = "py4j-0.10.9.3-src.zip"
+  val PY4J_ZIP_NAME = "py4j-0.10.9.5-src.zip"
 
   /** Get the PYTHONPATH for PySpark, either from SPARK_HOME, if it is set, or from our JAR */
   def sparkPythonPath: String = {
diff --git a/dev/deps/spark-deps-hadoop-2-hive-2.3 b/dev/deps/spark-deps-hadoop-2-hive-2.3
index bcbf8b9..28cb7c1 100644
--- a/dev/deps/spark-deps-hadoop-2-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-2-hive-2.3
@@ -233,7 +233,7 @@ parquet-hadoop/1.12.2//parquet-hadoop-1.12.2.jar
 parquet-jackson/1.12.2//parquet-jackson-1.12.2.jar
 pickle/1.2//pickle-1.2.jar
 protobuf-java/2.5.0//protobuf-java-2.5.0.jar
-py4j/0.10.9.3//py4j-0.10.9.3.jar
+py4j/0.10.9.5//py4j-0.10.9.5.jar
 remotetea-oncrpc/1.1.2//remotetea-oncrpc-1.1.2.jar
 rocksdbjni/6.20.3//rocksdbjni-6.20.3.jar
 scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar
diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3
index 8ca7880..07549ef 100644
--- a/dev/deps/spark-deps-hadoop-3-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-3-hive-2.3
@@ -221,7 +221,7 @@ parquet-hadoop/1.12.2//parquet-hadoop-1.12.2.jar
 parquet-jackson/1.12.2//parquet-jackson-1.12.2.jar
 pickle/1.2//pickle-1.2.jar
 protobuf-java/2.5.0//protobuf-java-2.5.0.jar
-py4j/0.10.9.3//py4j-0.10.9.3.jar
+py4j/0.10.9.5//py4j-0.10.9.5.jar
 remotetea-oncrpc/1.1.2//remotetea-oncrpc-1.1.2.jar
 rocksdbjni/6.20.3//rocksdbjni-6.20.3.jar
 scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar
diff --git a/docs/job-scheduling.md b/docs/job-scheduling.md
index 4ed2aa9..f44ed82 100644
--- a/docs/job-scheduling.md
+++ b/docs/job-scheduling.md
@@ -304,5 +304,5 @@ via `sc.setJobGroup` in a separate PVM thread, which also disallows to cancel th
 later.
 
 `pyspark.InheritableThread` is recommended to use together for a PVM thread to inherit the inheritable attributes
- such as local properties in a JVM thread, and to avoid resource leak.
+ such as local properties in a JVM thread.
 
diff --git a/python/docs/Makefile b/python/docs/Makefile
index 9cb1a17..14e5214 100644
--- a/python/docs/Makefile
+++ b/python/docs/Makefile
@@ -21,7 +21,7 @@ SPHINXBUILD   ?= sphinx-build
 SOURCEDIR     ?= source
 BUILDDIR      ?= build
 
-export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.9.3-src.zip)
+export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.9.5-src.zip)
 
 # Put it first so that "make" without argument is like "make help".
 help:
diff --git a/python/docs/make2.bat b/python/docs/make2.bat
index 2e4e2b5..d36b7a1 100644
--- a/python/docs/make2.bat
+++ b/python/docs/make2.bat
@@ -25,7 +25,7 @@ if "%SPHINXBUILD%" == "" (
 set SOURCEDIR=source
 set BUILDDIR=build
 
-set PYTHONPATH=..;..\lib\py4j-0.10.9.3-src.zip
+set PYTHONPATH=..;..\lib\py4j-0.10.9.5-src.zip
 
 if "%1" == "" goto help
 
diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst
index 15a1240..479f8bf 100644
--- a/python/docs/source/getting_started/install.rst
+++ b/python/docs/source/getting_started/install.rst
@@ -157,7 +157,7 @@ Package       Minimum supported version Note
 `pandas`      1.0.5                     Optional for Spark SQL
 `NumPy`       1.7                       Required for MLlib DataFrame-based API
 `pyarrow`     1.0.0                     Optional for Spark SQL
-`Py4J`        0.10.9.3                  Required
+`Py4J`        0.10.9.5                  Required
 `pandas`      1.0.5                     Required for pandas API on Spark
 `pyarrow`     1.0.0                     Required for pandas API on Spark
 `Numpy`       1.14                      Required for pandas API on Spark
diff --git a/python/lib/py4j-0.10.9.3-src.zip b/python/lib/py4j-0.10.9.3-src.zip
deleted file mode 100644
index 428f3ac..0000000
Binary files a/python/lib/py4j-0.10.9.3-src.zip and /dev/null differ
diff --git a/python/lib/py4j-0.10.9.5-src.zip b/python/lib/py4j-0.10.9.5-src.zip
new file mode 100644
index 0000000..478d4b0
Binary files /dev/null and b/python/lib/py4j-0.10.9.5-src.zip differ
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index e47f162..59b5fa7 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -1365,7 +1365,7 @@ class SparkContext:
         to HDFS-1208, where HDFS may respond to Thread.interrupt() by marking nodes as dead.
 
         If you run jobs in parallel, use :class:`pyspark.InheritableThread` for thread
-        local inheritance, and preventing resource leak.
+        local inheritance.
 
         Examples
         --------
@@ -1405,7 +1405,7 @@ class SparkContext:
         Notes
         -----
         If you run jobs in parallel, use :class:`pyspark.InheritableThread` for thread
-        local inheritance, and preventing resource leak.
+        local inheritance.
         """
         self._jsc.setLocalProperty(key, value)
 
@@ -1423,7 +1423,7 @@ class SparkContext:
         Notes
         -----
         If you run jobs in parallel, use :class:`pyspark.InheritableThread` for thread
-        local inheritance, and preventing resource leak.
+        local inheritance.
         """
         self._jsc.setJobDescription(value)
 
diff --git a/python/pyspark/util.py b/python/pyspark/util.py
index 5abbbb9..b7b972a 100644
--- a/python/pyspark/util.py
+++ b/python/pyspark/util.py
@@ -331,13 +331,10 @@ def inheritable_thread_target(f: Callable) -> Callable:
 
         @functools.wraps(f)
         def wrapped(*args: Any, **kwargs: Any) -> Any:
-            try:
-                # Set local properties in child thread.
-                assert SparkContext._active_spark_context is not None
-                SparkContext._active_spark_context._jsc.sc().setLocalProperties(properties)
-                return f(*args, **kwargs)
-            finally:
-                InheritableThread._clean_py4j_conn_for_current_thread()
+            # Set local properties in child thread.
+            assert SparkContext._active_spark_context is not None
+            SparkContext._active_spark_context._jsc.sc().setLocalProperties(properties)
+            return f(*args, **kwargs)
 
         return wrapped
     else:
@@ -377,10 +374,7 @@ class InheritableThread(threading.Thread):
                 assert hasattr(self, "_props")
                 assert SparkContext._active_spark_context is not None
                 SparkContext._active_spark_context._jsc.sc().setLocalProperties(self._props)
-                try:
-                    return target(*a, **k)
-                finally:
-                    InheritableThread._clean_py4j_conn_for_current_thread()
+                return target(*a, **k)
 
             super(InheritableThread, self).__init__(
                 target=copy_local_properties, *args, **kwargs  # type: ignore[misc]
@@ -401,25 +395,6 @@ class InheritableThread(threading.Thread):
             self._props = SparkContext._active_spark_context._jsc.sc().getLocalProperties().clone()
         return super(InheritableThread, self).start()
 
-    @staticmethod
-    def _clean_py4j_conn_for_current_thread() -> None:
-        from pyspark import SparkContext
-
-        jvm = SparkContext._jvm
-        assert jvm is not None
-        thread_connection = jvm._gateway_client.get_thread_connection()
-        if thread_connection is not None:
-            try:
-                # Dequeue is shared across other threads but it's thread-safe.
-                # If this function has to be invoked one more time in the same thead
-                # Py4J will create a new connection automatically.
-                jvm._gateway_client.deque.remove(thread_connection)
-            except ValueError:
-                # Should never reach this point
-                return
-            finally:
-                thread_connection.close()
-
 
 if __name__ == "__main__":
     if "pypy" not in platform.python_implementation().lower() and sys.version_info[:2] >= (3, 7):
diff --git a/python/setup.py b/python/setup.py
index 673b146..a8d16ff 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -258,7 +258,7 @@ try:
         license='http://www.apache.org/licenses/LICENSE-2.0',
         # Don't forget to update python/docs/source/getting_started/install.rst
         # if you're updating the versions or dependencies.
-        install_requires=['py4j==0.10.9.3'],
+        install_requires=['py4j==0.10.9.5'],
         extras_require={
             'ml': ['numpy>=1.15'],
             'mllib': ['numpy>=1.15'],
diff --git a/sbin/spark-config.sh b/sbin/spark-config.sh
index f27b6fe..6044de2 100755
--- a/sbin/spark-config.sh
+++ b/sbin/spark-config.sh
@@ -28,6 +28,6 @@ export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}/conf"}"
 # Add the PySpark classes to the PYTHONPATH:
 if [ -z "${PYSPARK_PYTHONPATH_SET}" ]; then
   export PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}"
-  export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.3-src.zip:${PYTHONPATH}"
+  export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.5-src.zip:${PYTHONPATH}"
   export PYSPARK_PYTHONPATH_SET=1
 fi
