You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2023/03/08 18:00:27 UTC
[spark] branch branch-3.4 updated: [SPARK-42709][PYTHON] Remove the assumption of `__file__` being available
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push:
new 5aae9707450 [SPARK-42709][PYTHON] Remove the assumption of `__file__` being available
5aae9707450 is described below
commit 5aae970745007e2b5fcbcf6491ed3a72e93f4763
Author: Hyukjin Kwon <gu...@apache.org>
AuthorDate: Wed Mar 8 09:59:33 2023 -0800
[SPARK-42709][PYTHON] Remove the assumption of `__file__` being available
### What changes were proposed in this pull request?
This PR proposes to add a check for the `__file__` attribute before using it.
### Why are the changes needed?
`__file__` might not be available everywhere. See also https://github.com/scikit-learn/scikit-learn/issues/20081
### Does this PR introduce _any_ user-facing change?
Yes. If a user's Python environment does not provide `__file__`, they can now use PySpark in that environment as well.
### How was this patch tested?
Manually tested.
Closes #40328 from HyukjinKwon/SPARK-42709.
Authored-by: Hyukjin Kwon <gu...@apache.org>
Signed-off-by: Dongjoon Hyun <do...@apache.org>
(cherry picked from commit f95fc19d9491fa79a3f34cfb721b4919c9b3bb0f)
Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
python/pyspark/find_spark_home.py | 10 +++++++---
python/pyspark/sql/connect/catalog.py | 2 +-
python/pyspark/sql/connect/client.py | 2 +-
python/pyspark/sql/connect/column.py | 2 +-
python/pyspark/sql/connect/conversion.py | 2 +-
python/pyspark/sql/connect/dataframe.py | 2 +-
python/pyspark/sql/connect/expressions.py | 2 +-
python/pyspark/sql/connect/functions.py | 2 +-
python/pyspark/sql/connect/group.py | 2 +-
python/pyspark/sql/connect/plan.py | 2 +-
python/pyspark/sql/connect/readwriter.py | 2 +-
python/pyspark/sql/connect/session.py | 2 +-
python/pyspark/sql/connect/types.py | 2 +-
python/pyspark/sql/connect/udf.py | 2 +-
python/pyspark/sql/connect/utils.py | 4 ++--
python/pyspark/sql/connect/window.py | 2 +-
python/pyspark/worker.py | 19 ++++++++++---------
17 files changed, 33 insertions(+), 28 deletions(-)
diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py
index 09f4551ea5f..a2226f8385e 100755
--- a/python/pyspark/find_spark_home.py
+++ b/python/pyspark/find_spark_home.py
@@ -42,11 +42,15 @@ def _find_spark_home():
spark_dist_dir = "spark-distribution"
paths = [
"../", # When we're in spark/python.
- # Two case belows are valid when the current script is called as a library.
- os.path.join(os.path.dirname(os.path.realpath(__file__)), spark_dist_dir),
- os.path.dirname(os.path.realpath(__file__)),
]
+ if "__file__" in globals():
+ paths += [
+            # The two cases below are valid when the current script is called as a library.
+ os.path.join(os.path.dirname(os.path.realpath(__file__)), spark_dist_dir),
+ os.path.dirname(os.path.realpath(__file__)),
+ ]
+
# Add the path of the PySpark module if it exists
import_error_raised = False
from importlib.util import find_spec
diff --git a/python/pyspark/sql/connect/catalog.py b/python/pyspark/sql/connect/catalog.py
index f2bbae344f2..261f87b4cc6 100644
--- a/python/pyspark/sql/connect/catalog.py
+++ b/python/pyspark/sql/connect/catalog.py
@@ -16,7 +16,7 @@
#
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
from typing import Any, Callable, List, Optional, TYPE_CHECKING
diff --git a/python/pyspark/sql/connect/client.py b/python/pyspark/sql/connect/client.py
index 6334036fca4..baa6d641422 100644
--- a/python/pyspark/sql/connect/client.py
+++ b/python/pyspark/sql/connect/client.py
@@ -23,7 +23,7 @@ import string
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
import logging
import os
diff --git a/python/pyspark/sql/connect/column.py b/python/pyspark/sql/connect/column.py
index bc8b60beb97..d2be32b905e 100644
--- a/python/pyspark/sql/connect/column.py
+++ b/python/pyspark/sql/connect/column.py
@@ -16,7 +16,7 @@
#
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
import datetime
import decimal
diff --git a/python/pyspark/sql/connect/conversion.py b/python/pyspark/sql/connect/conversion.py
index 7b452de48f6..2b16fc7766d 100644
--- a/python/pyspark/sql/connect/conversion.py
+++ b/python/pyspark/sql/connect/conversion.py
@@ -16,7 +16,7 @@
#
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
import array
import datetime
diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py
index 38e245f0335..f8b92cdc7ae 100644
--- a/python/pyspark/sql/connect/dataframe.py
+++ b/python/pyspark/sql/connect/dataframe.py
@@ -16,7 +16,7 @@
#
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
from typing import (
Any,
diff --git a/python/pyspark/sql/connect/expressions.py b/python/pyspark/sql/connect/expressions.py
index 0d059740032..64176327c16 100644
--- a/python/pyspark/sql/connect/expressions.py
+++ b/python/pyspark/sql/connect/expressions.py
@@ -16,7 +16,7 @@
#
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
from typing import (
cast,
diff --git a/python/pyspark/sql/connect/functions.py b/python/pyspark/sql/connect/functions.py
index 268774e3211..c89b0ad3fc0 100644
--- a/python/pyspark/sql/connect/functions.py
+++ b/python/pyspark/sql/connect/functions.py
@@ -16,7 +16,7 @@
#
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
import inspect
import warnings
diff --git a/python/pyspark/sql/connect/group.py b/python/pyspark/sql/connect/group.py
index 8d876762804..e699ce7105a 100644
--- a/python/pyspark/sql/connect/group.py
+++ b/python/pyspark/sql/connect/group.py
@@ -16,7 +16,7 @@
#
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
from typing import (
Any,
diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py
index 8e5a96b974d..7444cd14c18 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -16,7 +16,7 @@
#
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
from typing import Any, List, Optional, Sequence, Union, cast, TYPE_CHECKING, Mapping, Dict
import functools
diff --git a/python/pyspark/sql/connect/readwriter.py b/python/pyspark/sql/connect/readwriter.py
index d20eaa44148..52a7a6c8cf5 100644
--- a/python/pyspark/sql/connect/readwriter.py
+++ b/python/pyspark/sql/connect/readwriter.py
@@ -16,7 +16,7 @@
#
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
from typing import Dict
from typing import Optional, Union, List, overload, Tuple, cast, Any
diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py
index dd1c2d3c510..475bd2fb6bd 100644
--- a/python/pyspark/sql/connect/session.py
+++ b/python/pyspark/sql/connect/session.py
@@ -16,7 +16,7 @@
#
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
import os
import warnings
diff --git a/python/pyspark/sql/connect/types.py b/python/pyspark/sql/connect/types.py
index 33ddd4e9929..8e91709b8fc 100644
--- a/python/pyspark/sql/connect/types.py
+++ b/python/pyspark/sql/connect/types.py
@@ -16,7 +16,7 @@
#
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
import json
diff --git a/python/pyspark/sql/connect/udf.py b/python/pyspark/sql/connect/udf.py
index 03e53cbd89e..bb7b70e613a 100644
--- a/python/pyspark/sql/connect/udf.py
+++ b/python/pyspark/sql/connect/udf.py
@@ -19,7 +19,7 @@ User-defined function related classes and functions
"""
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
import sys
import functools
diff --git a/python/pyspark/sql/connect/utils.py b/python/pyspark/sql/connect/utils.py
index fbc34aa8d59..eb9604d5fce 100644
--- a/python/pyspark/sql/connect/utils.py
+++ b/python/pyspark/sql/connect/utils.py
@@ -19,13 +19,13 @@ import sys
from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version
-def check_dependencies(mod_name: str, file_name: str) -> None:
+def check_dependencies(mod_name: str) -> None:
if mod_name == "__main__":
from pyspark.testing.connectutils import should_test_connect, connect_requirement_message
if not should_test_connect:
print(
- f"Skipping {file_name} doctests: {connect_requirement_message}",
+ f"Skipping {mod_name} doctests: {connect_requirement_message}",
file=sys.stderr,
)
sys.exit(0)
diff --git a/python/pyspark/sql/connect/window.py b/python/pyspark/sql/connect/window.py
index 51a9452e611..fa50fd97e31 100644
--- a/python/pyspark/sql/connect/window.py
+++ b/python/pyspark/sql/connect/window.py
@@ -16,7 +16,7 @@
#
from pyspark.sql.connect.utils import check_dependencies
-check_dependencies(__name__, __file__)
+check_dependencies(__name__)
import sys
from typing import TYPE_CHECKING, Union, Sequence, List, Optional
diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
index c1c3669701f..cd5bb649c8b 100644
--- a/python/pyspark/worker.py
+++ b/python/pyspark/worker.py
@@ -717,15 +717,16 @@ def main(infile, outfile):
lineno = (
getframeinfo(currentframe()).lineno + 1 if currentframe() is not None else 0
)
- print(
- warnings.formatwarning(
- "Failed to set memory limit: {0}".format(e),
- ResourceWarning,
- __file__,
- lineno,
- ),
- file=sys.stderr,
- )
+ if "__file__" in globals():
+ print(
+ warnings.formatwarning(
+ "Failed to set memory limit: {0}".format(e),
+ ResourceWarning,
+ __file__,
+ lineno,
+ ),
+ file=sys.stderr,
+ )
# initialize global state
taskContext = None
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org