Posted to commits@spark.apache.org by gu...@apache.org on 2021/04/07 11:51:43 UTC
[spark] branch master updated: [SPARK-34972][PYTHON] Make pandas-on-Spark doctests work
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 2635c38 [SPARK-34972][PYTHON] Make pandas-on-Spark doctests work
2635c38 is described below
commit 2635c3894ff935bf1cd2d86648a28dcb4dc3dc73
Author: Takuya UESHIN <ue...@databricks.com>
AuthorDate: Wed Apr 7 20:50:41 2021 +0900
[SPARK-34972][PYTHON] Make pandas-on-Spark doctests work
### What changes were proposed in this pull request?
Now that the Koalas main code has been merged into the PySpark code base (#32036), we should enable its doctests on Spark's test infrastructure.
### Why are the changes needed?
Currently the pandas-on-Spark modules are not tested at all.
We should enable doctests first; other unit tests will be ported separately later.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Enabled the full set of doctests.
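Because every touched module gains an `if __name__ == "__main__": _test()` guard, a single module's doctests can also be run directly. A minimal sketch (not part of the commit; it assumes SPARK_HOME points at a Spark checkout and that pyspark, pandas, and pyarrow are importable; the path below is hypothetical):

    import os
    import runpy

    # SPARK_HOME must be set: every _test() added by this commit starts
    # with os.chdir(os.environ["SPARK_HOME"]).
    os.environ.setdefault("SPARK_HOME", "/path/to/spark")  # hypothetical path

    # Executes pyspark/pandas/config.py as a script, which triggers its
    # `if __name__ == "__main__": _test()` block and runs the doctests.
    runpy.run_module("pyspark.pandas.config", run_name="__main__")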
Closes #32069 from ueshin/issues/SPARK-34972/pyspark-pandas_doctests.
Authored-by: Takuya UESHIN <ue...@databricks.com>
Signed-off-by: HyukjinKwon <gu...@apache.org>
---
.github/workflows/build_and_test.yml | 2 +
dev/run-tests.py | 6 +--
dev/sparktestsupport/modules.py | 44 ++++++++++++++++
python/pyspark/pandas/__init__.py | 17 ++++--
python/pyspark/pandas/accessors.py | 30 +++++++++++
python/pyspark/pandas/base.py | 28 ++++++++++
python/pyspark/pandas/categorical.py | 30 +++++++++++
python/pyspark/pandas/config.py | 28 ++++++++++
python/pyspark/pandas/datetimes.py | 30 +++++++++++
python/pyspark/pandas/exceptions.py | 30 +++++++++++
python/pyspark/pandas/extensions.py | 32 ++++++++++++
python/pyspark/pandas/frame.py | 53 +++++++++++++++++--
python/pyspark/pandas/generic.py | 38 ++++++++++++++
python/pyspark/pandas/groupby.py | 40 +++++++++++++--
python/pyspark/pandas/indexes/base.py | 30 +++++++++++
python/pyspark/pandas/indexes/category.py | 30 +++++++++++
python/pyspark/pandas/indexes/datetimes.py | 30 +++++++++++
python/pyspark/pandas/indexes/multi.py | 32 ++++++++++++
python/pyspark/pandas/indexes/numeric.py | 30 +++++++++++
python/pyspark/pandas/indexing.py | 30 +++++++++++
python/pyspark/pandas/internal.py | 30 +++++++++++
python/pyspark/pandas/ml.py | 24 +++++++++
python/pyspark/pandas/mlflow.py | 39 +++++++++++++-
python/pyspark/pandas/namespace.py | 60 +++++++++++++++++++---
python/pyspark/pandas/numpy_compat.py | 30 +++++++++++
python/pyspark/pandas/series.py | 30 ++++++++++-
python/pyspark/pandas/spark/accessors.py | 48 +++++++++++++++++
python/pyspark/pandas/spark/utils.py | 19 +++++++
python/pyspark/pandas/{sql.py => sql_processor.py} | 30 +++++++++++
python/pyspark/pandas/strings.py | 30 +++++++++++
python/pyspark/pandas/typedef/typehints.py | 19 +++++++
python/pyspark/pandas/usage_logging/__init__.py | 6 +--
python/pyspark/pandas/utils.py | 28 ++++++++++
python/pyspark/pandas/window.py | 28 ++++++++++
34 files changed, 983 insertions(+), 28 deletions(-)
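Most of the Python files below gain the same scaffold: a module-level `_test()` that builds a local SparkSession, runs `doctest.testmod` with the ELLIPSIS and NORMALIZE_WHITESPACE flags, and exits non-zero on any failure. Distilled to its core it looks like the sketch below (a summary only; the per-module versions also seed `globs` with extras such as `np`, `pd`, a temp `path`, or a scratch database `db`):

    import doctest
    import os
    import sys

    from pyspark.sql import SparkSession
    import pyspark.pandas
    import pyspark.pandas.window as target  # stand-in for any module below

    os.chdir(os.environ["SPARK_HOME"])

    globs = target.__dict__.copy()
    globs["pp"] = pyspark.pandas  # the doctests refer to pandas-on-Spark as `pp`
    spark = (
        SparkSession.builder.master("local[4]").appName("doctest sketch").getOrCreate()
    )
    failure_count, test_count = doctest.testmod(
        target,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    spark.stop()
    if failure_count:
        sys.exit(-1)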
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9253d58..3abe206 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -161,6 +161,8 @@ jobs:
pyspark-sql, pyspark-mllib, pyspark-resource
- >-
pyspark-core, pyspark-streaming, pyspark-ml
+ - >-
+ pyspark-pandas
env:
MODULES_TO_TEST: ${{ matrix.modules }}
HADOOP_PROFILE: hadoop3.2
diff --git a/dev/run-tests.py b/dev/run-tests.py
index 83f9f02..c5b412d 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -123,17 +123,17 @@ def determine_modules_to_test(changed_modules, deduplicated=True):
>>> [x.name for x in determine_modules_to_test([modules.sql])]
... # doctest: +NORMALIZE_WHITESPACE
['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples', 'hive-thriftserver',
- 'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-ml']
+ 'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-ml']
>>> sorted([x.name for x in determine_modules_to_test(
... [modules.sparkr, modules.sql], deduplicated=False)])
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'examples', 'hive', 'hive-thriftserver', 'mllib', 'pyspark-ml',
- 'pyspark-mllib', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
+ 'pyspark-mllib', 'pyspark-pandas', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
>>> sorted([x.name for x in determine_modules_to_test(
... [modules.sql, modules.core], deduplicated=False)])
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'catalyst', 'core', 'examples', 'graphx', 'hive', 'hive-thriftserver',
- 'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib',
+ 'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas',
'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root',
'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10',
'streaming-kinesis-asl']
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 87bfbdf..6823415 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -573,6 +573,50 @@ pyspark_ml = Module(
]
)
+pyspark_pandas = Module(
+ name="pyspark-pandas",
+ dependencies=[pyspark_core, pyspark_sql],
+ source_file_regexes=[
+ "python/pyspark/pandas/"
+ ],
+ python_test_goals=[
+ # doctests
+ "pyspark.pandas.accessors",
+ "pyspark.pandas.base",
+ "pyspark.pandas.categorical",
+ "pyspark.pandas.config",
+ "pyspark.pandas.datetimes",
+ "pyspark.pandas.exceptions",
+ "pyspark.pandas.extensions",
+ "pyspark.pandas.frame",
+ "pyspark.pandas.generic",
+ "pyspark.pandas.groupby",
+ "pyspark.pandas.indexing",
+ "pyspark.pandas.internal",
+ "pyspark.pandas.ml",
+ "pyspark.pandas.mlflow",
+ "pyspark.pandas.namespace",
+ "pyspark.pandas.numpy_compat",
+ "pyspark.pandas.series",
+ "pyspark.pandas.sql_processor",
+ "pyspark.pandas.strings",
+ "pyspark.pandas.utils",
+ "pyspark.pandas.window",
+ "pyspark.pandas.indexes.base",
+ "pyspark.pandas.indexes.category",
+ "pyspark.pandas.indexes.datetimes",
+ "pyspark.pandas.indexes.multi",
+ "pyspark.pandas.indexes.numeric",
+ "pyspark.pandas.spark.accessors",
+ "pyspark.pandas.spark.utils",
+ "pyspark.pandas.typedef.typehints",
+ ],
+ excluded_python_implementations=[
+ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
+ # they aren't available there
+ ]
+)
+
sparkr = Module(
name="sparkr",
dependencies=[hive, mllib],
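With the module registered, its wiring can be sanity-checked from the repo root. A sketch, assuming `modules.all_modules` is the registry this file builds up (as the dev/run-tests.py doctests above suggest):

    import sys

    sys.path.insert(0, "dev")  # run from the Spark repo root
    from sparktestsupport import modules

    # Assumed: all_modules collects every Module defined in modules.py;
    # dev/run-tests.py selects test goals from it.
    pandas_mod = next(m for m in modules.all_modules if m.name == "pyspark-pandas")
    print(len(pandas_mod.python_test_goals))  # 29 doctest goals, as listed above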
diff --git a/python/pyspark/pandas/__init__.py b/python/pyspark/pandas/__init__.py
index a605954..1e7363c 100644
--- a/python/pyspark/pandas/__init__.py
+++ b/python/pyspark/pandas/__init__.py
@@ -17,13 +17,24 @@
import os
import sys
from distutils.version import LooseVersion
+import warnings
+
+from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version
+
+try:
+ require_minimum_pandas_version()
+ require_minimum_pyarrow_version()
+except ImportError as e:
+ if os.environ.get("SPARK_TESTING"):
+ warnings.warn(str(e))
+ sys.exit()
+ else:
+ raise
from pyspark.pandas.version import __version__ # noqa: F401
def assert_python_version():
- import warnings
-
major = 3
minor = 5
deprecated_version = (major, minor)
@@ -206,4 +217,4 @@ _auto_patch_pandas()
# Import after the usage logger is attached.
from pyspark.pandas.config import get_option, options, option_context, reset_option, set_option
from pyspark.pandas.namespace import * # F405
-from pyspark.pandas.sql import sql
+from pyspark.pandas.sql_processor import sql
diff --git a/python/pyspark/pandas/accessors.py b/python/pyspark/pandas/accessors.py
index 39a647c0..bcc3e76 100644
--- a/python/pyspark/pandas/accessors.py
+++ b/python/pyspark/pandas/accessors.py
@@ -928,3 +928,33 @@ class KoalasSeriesMethods(object):
),
dtype=dtype,
)
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.accessors
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.accessors.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.accessors tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.accessors,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
index 6ae27a8..58a3a9c 100644
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@@ -1993,3 +1993,31 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
uniques = pd.Index(uniques_list)
return codes, uniques
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.base
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.base.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]").appName("pyspark.pandas.base tests").getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.base,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index 87d56d0..c88e888 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -162,3 +162,33 @@ class CategoricalAccessor(object):
self, new_categories, ordered: bool = None, rename: bool = False, inplace: bool = False
):
raise NotImplementedError()
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.categorical
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.categorical.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.categorical tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.categorical,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/config.py b/python/pyspark/pandas/config.py
index 7f011a2..4d0a3cd 100644
--- a/python/pyspark/pandas/config.py
+++ b/python/pyspark/pandas/config.py
@@ -440,3 +440,31 @@ class DictWrapper:
options = DictWrapper(_options_dict)
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.config
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.config.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]").appName("pyspark.pandas.config tests").getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.config,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/datetimes.py b/python/pyspark/pandas/datetimes.py
index a5d3f38..bc93094 100644
--- a/python/pyspark/pandas/datetimes.py
+++ b/python/pyspark/pandas/datetimes.py
@@ -848,3 +848,33 @@ class DatetimeMethods(object):
return s.dt.day_name(locale=locale)
return self._data.koalas.transform_batch(pandas_day_name)
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.datetimes
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.datetimes.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.datetimes tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.datetimes,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/exceptions.py b/python/pyspark/pandas/exceptions.py
index 7ec874b..7be362a 100644
--- a/python/pyspark/pandas/exceptions.py
+++ b/python/pyspark/pandas/exceptions.py
@@ -104,3 +104,33 @@ class PandasNotImplementedError(NotImplementedError):
class_name, property_name, reason
)
super().__init__(msg)
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.exceptions
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.exceptions.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.exceptions tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.exceptions,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/extensions.py b/python/pyspark/pandas/extensions.py
index 9b67f2a..2ce2738 100644
--- a/python/pyspark/pandas/extensions.py
+++ b/python/pyspark/pandas/extensions.py
@@ -340,3 +340,35 @@ def register_index_accessor(name):
from pyspark.pandas import Index
return _register_accessor(name, Index)
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ import numpy
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.extensions
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.extensions.__dict__.copy()
+ globs["np"] = numpy
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.extensions tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.extensions,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 4e7b9fa..96eb439 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -387,7 +387,7 @@ def _create_tuple_for_frame_type(params):
return Tuple[tuple(new_params)]
-if (3, 5) <= sys.version_info < (3, 7):
+if (3, 5) <= sys.version_info < (3, 7) and __name__ != "__main__":
from typing import GenericMeta # type: ignore
# This is a workaround to support variadic generic in DataFrame in Python 3.5+.
@@ -3506,7 +3506,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
Examples
--------
- >>> pp.range(1001).style # doctest: +ELLIPSIS
+ >>> pp.range(1001).style # doctest: +SKIP
<pandas.io.formats.style.Styler object at ...>
"""
max_results = get_option("compute.max_rows")
@@ -4694,16 +4694,17 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
Create a new Delta Lake table, partitioned by one column:
- >>> df.to_delta('%s/to_delta/foo' % path, partition_cols='date')
+ >>> df.to_delta('%s/to_delta/foo' % path, partition_cols='date') # doctest: +SKIP
Partitioned by two columns:
- >>> df.to_delta('%s/to_delta/bar' % path, partition_cols=['date', 'country'])
+ >>> df.to_delta('%s/to_delta/bar' % path,
+ ... partition_cols=['date', 'country']) # doctest: +SKIP
Overwrite an existing table's partitions, using the 'replaceWhere' capability in Delta:
>>> df.to_delta('%s/to_delta/bar' % path,
- ... mode='overwrite', replaceWhere='date >= "2012-01-01"')
+ ... mode='overwrite', replaceWhere='date >= "2012-01-01"') # doctest: +SKIP
"""
if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
options = options.get("options") # type: ignore
@@ -11974,3 +11975,45 @@ class CachedDataFrame(DataFrame):
return self.spark.unpersist()
unpersist.__doc__ = CachedSparkFrameMethods.unpersist.__doc__
+
+
+def _test():
+ import os
+ import doctest
+ import shutil
+ import sys
+ import tempfile
+ import uuid
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.frame
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.frame.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]").appName("pyspark.pandas.frame tests").getOrCreate()
+ )
+
+ db_name = "db%s" % str(uuid.uuid4()).replace("-", "")
+ spark.sql("CREATE DATABASE %s" % db_name)
+ globs["db"] = db_name
+
+ path = tempfile.mkdtemp()
+ globs["path"] = path
+
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.frame,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+
+ shutil.rmtree(path, ignore_errors=True)
+ spark.sql("DROP DATABASE IF EXISTS %s CASCADE" % db_name)
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index 46d6051..2d3e0ab 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -3100,3 +3100,41 @@ class Frame(object, metaclass=ABCMeta):
return F.count(F.nanvl(spark_column, F.lit(None)))
else:
return F.count(spark_column)
+
+
+def _test():
+ import os
+ import doctest
+ import shutil
+ import sys
+ import tempfile
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.generic
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.generic.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.generic tests")
+ .getOrCreate()
+ )
+
+ path = tempfile.mkdtemp()
+ globs["path"] = path
+
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.generic,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+
+ shutil.rmtree(path, ignore_errors=True)
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index c3fe2d8..90ad41c 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -1894,12 +1894,12 @@ class GroupBy(object, metaclass=ABCMeta):
... 'b': [2, 3, 1, 4, 6, 9, 8, 10, 7, 5],
... 'c': [3, 5, 2, 5, 1, 2, 6, 4, 3, 6]},
... columns=['a', 'b', 'c'],
- ... index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])
+ ... index=[7, 2, 3, 1, 3, 4, 9, 10, 5, 6])
>>> df
a b c
7 1 2 3
2 1 3 5
- 4 1 1 2
+ 3 1 1 2
1 1 4 5
3 2 6 1
4 2 9 2
@@ -1911,16 +1911,16 @@ class GroupBy(object, metaclass=ABCMeta):
>>> df.groupby('a').tail(2).sort_index()
a b c
1 1 4 5
+ 3 1 1 2
4 2 9 2
- 4 1 1 2
5 3 7 3
6 3 5 6
9 2 8 6
>>> df.groupby('a')['b'].tail(2).sort_index()
1 4
+ 3 1
4 9
- 4 1
5 7
6 5
9 8
@@ -3184,3 +3184,35 @@ def normalize_keyword_aggregation(kwargs):
if isinstance(order[0][0], tuple):
order = [(*levs, method) for levs, method in order]
return aggspec, columns, order
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ import numpy
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.groupby
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.groupby.__dict__.copy()
+ globs["np"] = numpy
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.groupby tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.groupby,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py
index 1b224df..170fa6c 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -2469,3 +2469,33 @@ class Index(IndexOpsMixin):
"The truth value of a {0} is ambiguous. "
"Use a.empty, a.bool(), a.item(), a.any() or a.all().".format(self.__class__.__name__)
)
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.indexes.base
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.indexes.base.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.indexes.base tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.indexes.base,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py
index 8bdec59..cb4e3bd 100644
--- a/python/pyspark/pandas/indexes/category.py
+++ b/python/pyspark/pandas/indexes/category.py
@@ -186,3 +186,33 @@ class CategoricalIndex(Index):
else:
return partial(property_or_func, self)
raise AttributeError("'CategoricalIndex' object has no attribute '{}'".format(item))
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.indexes.category
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.indexes.category.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.indexes.category tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.indexes.category,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/indexes/datetimes.py b/python/pyspark/pandas/indexes/datetimes.py
index 88ba7d2..b39196b 100644
--- a/python/pyspark/pandas/indexes/datetimes.py
+++ b/python/pyspark/pandas/indexes/datetimes.py
@@ -740,3 +740,33 @@ class DatetimeIndex(Index):
def disallow_nanoseconds(freq):
if freq in ["N", "ns"]:
raise ValueError("nanoseconds is not supported")
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.indexes.datetimes
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.indexes.datetimes.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.indexes.datetimes tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.indexes.datetimes,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py
index ee8ea1e..623c83f 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -1168,3 +1168,35 @@ class MultiIndex(Index):
def __iter__(self):
return MissingPandasLikeMultiIndex.__iter__(self)
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ import numpy
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.indexes.multi
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.indexes.multi.__dict__.copy()
+ globs["np"] = numpy
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.indexes.multi tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.indexes.multi,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/indexes/numeric.py b/python/pyspark/pandas/indexes/numeric.py
index 1acad3f..f28bb61 100644
--- a/python/pyspark/pandas/indexes/numeric.py
+++ b/python/pyspark/pandas/indexes/numeric.py
@@ -145,3 +145,33 @@ class Float64Index(NumericIndex):
return Index(data, dtype=dtype, copy=copy, name=name)
return pp.from_pandas(pd.Float64Index(data=data, dtype=dtype, copy=copy, name=name))
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.indexes.numeric
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.indexes.numeric.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.indexes.numeric tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.indexes.numeric,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py
index 0016d46..326fce1 100644
--- a/python/pyspark/pandas/indexing.py
+++ b/python/pyspark/pandas/indexing.py
@@ -1706,3 +1706,33 @@ class iLocIndexer(LocIndexerLike):
# Clean up implicitly cached properties to be able to reuse the indexer.
del self._internal
del self._sequence_col
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.indexing
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.indexing.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.indexing tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.indexing,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py
index 94abe97..c7fa438 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -1436,3 +1436,33 @@ class InternalFrame(object):
reset_index[name] = col.replace({np.nan: None})
return reset_index, index_columns, index_dtypes, data_columns, data_dtypes
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.internal
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.internal.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.internal tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.internal,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/ml.py b/python/pyspark/pandas/ml.py
index 6ea2cf3..912644f 100644
--- a/python/pyspark/pandas/ml.py
+++ b/python/pyspark/pandas/ml.py
@@ -89,3 +89,27 @@ def to_numeric_df(kdf: "pp.DataFrame") -> Tuple[pyspark.sql.DataFrame, List[Tupl
va = VectorAssembler(inputCols=numeric_df.columns, outputCol=CORRELATION_OUTPUT_COLUMN)
v = va.transform(numeric_df).select(CORRELATION_OUTPUT_COLUMN)
return v, numeric_column_labels
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.ml
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.ml.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = SparkSession.builder.master("local[4]").appName("pyspark.pandas.ml tests").getOrCreate()
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.ml, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/mlflow.py b/python/pyspark/pandas/mlflow.py
index 15ff066..e073b1d 100644
--- a/python/pyspark/pandas/mlflow.py
+++ b/python/pyspark/pandas/mlflow.py
@@ -18,7 +18,6 @@
"""
MLflow-related functions to load models and apply them to Koalas dataframes.
"""
-from mlflow import pyfunc
from pyspark.sql.types import DataType
import pandas as pd
import numpy as np
@@ -62,10 +61,14 @@ class PythonModelWrapper(object):
"""
The return object has to follow the API of mlflow.pyfunc.PythonModel.
"""
+ from mlflow import pyfunc
+
return pyfunc.load_model(model_uri=self._model_uri)
@lazy_property
def _model_udf(self):
+ from mlflow import pyfunc
+
spark = default_session()
return pyfunc.spark_udf(spark, model_uri=self._model_uri, result_type=self._return_type)
@@ -190,3 +193,37 @@ def load_model(model_uri, predict_type="infer") -> PythonModelWrapper:
0 2.0 3.0 -1 1.376932
"""
return PythonModelWrapper(model_uri, predict_type)
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.mlflow
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.mlflow.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]").appName("pyspark.pandas.mlflow tests").getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.mlflow,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ try:
+ import mlflow # noqa: F401
+ import sklearn # noqa: F401
+
+ _test()
+ except ImportError:
+ pass
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index 3a14d6a..1a4ba49 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -512,13 +512,14 @@ def read_delta(
Examples
--------
- >>> pp.range(1).to_delta('%s/read_delta/foo' % path)
- >>> pp.read_delta('%s/read_delta/foo' % path)
+ >>> pp.range(1).to_delta('%s/read_delta/foo' % path) # doctest: +SKIP
+ >>> pp.read_delta('%s/read_delta/foo' % path) # doctest: +SKIP
id
0 0
- >>> pp.range(10, 15, num_partitions=1).to_delta('%s/read_delta/foo' % path, mode='overwrite')
- >>> pp.read_delta('%s/read_delta/foo' % path)
+ >>> pp.range(10, 15, num_partitions=1).to_delta('%s/read_delta/foo' % path,
+ ... mode='overwrite') # doctest: +SKIP
+ >>> pp.read_delta('%s/read_delta/foo' % path) # doctest: +SKIP
id
0 10
1 11
@@ -526,16 +527,15 @@ def read_delta(
3 13
4 14
- >>> pp.read_delta('%s/read_delta/foo' % path, version=0)
+ >>> pp.read_delta('%s/read_delta/foo' % path, version=0) # doctest: +SKIP
id
0 0
You can preserve the index in the roundtrip as below.
>>> pp.range(10, 15, num_partitions=1).to_delta(
- ... '%s/read_delta/bar' % path, index_col="index")
- >>> pp.read_delta('%s/read_delta/bar' % path, index_col="index")
- ... # doctest: +NORMALIZE_WHITESPACE
+ ... '%s/read_delta/bar' % path, index_col="index") # doctest: +SKIP
+ >>> pp.read_delta('%s/read_delta/bar' % path, index_col="index") # doctest: +SKIP
id
index
0 10
@@ -2875,3 +2875,47 @@ _get_dummies_acceptable_types = _get_dummies_default_accept_types + (
BooleanType,
TimestampType,
)
+
+
+def _test():
+ import os
+ import doctest
+ import shutil
+ import sys
+ import tempfile
+ import uuid
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.namespace
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.namespace.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.namespace tests")
+ .getOrCreate()
+ )
+
+ db_name = "db%s" % str(uuid.uuid4()).replace("-", "")
+ spark.sql("CREATE DATABASE %s" % db_name)
+ globs["db"] = db_name
+
+ path = tempfile.mkdtemp()
+ globs["path"] = path
+
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.namespace,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+
+ shutil.rmtree(path, ignore_errors=True)
+ spark.sql("DROP DATABASE IF EXISTS %s CASCADE" % db_name)
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/numpy_compat.py b/python/pyspark/pandas/numpy_compat.py
index 3ab4073..8d04896 100644
--- a/python/pyspark/pandas/numpy_compat.py
+++ b/python/pyspark/pandas/numpy_compat.py
@@ -208,3 +208,33 @@ def maybe_dispatch_ufunc_to_spark_func(
return column_op(convert_arguments)(*inputs) # type: ignore
else:
return NotImplemented
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.numpy_compat
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.numpy_compat.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.numpy_compat tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.numpy_compat,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 4f513a9..44de907 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -336,7 +336,7 @@ def _create_type_for_series_type(param):
return SeriesType[new_class]
-if (3, 5) <= sys.version_info < (3, 7):
+if (3, 5) <= sys.version_info < (3, 7) and __name__ != "__main__":
from typing import GenericMeta # type: ignore
old_getitem = GenericMeta.__getitem__ # type: ignore
@@ -6233,3 +6233,31 @@ def first_series(df) -> Union["Series", pd.Series]:
return df._kser_for(df._internal.column_labels[0])
else:
return df[df.columns[0]]
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.series
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.series.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]").appName("pyspark.pandas.series tests").getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.series,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/spark/accessors.py b/python/pyspark/pandas/spark/accessors.py
index 543e3f4..8b835d3 100644
--- a/python/pyspark/pandas/spark/accessors.py
+++ b/python/pyspark/pandas/spark/accessors.py
@@ -1247,3 +1247,51 @@ class CachedSparkFrameMethods(SparkFrameMethods):
"""
if self._kdf._cached.is_cached:
self._kdf._cached.unpersist()
+
+
+def _test():
+ import os
+ import doctest
+ import shutil
+ import sys
+ import tempfile
+ import uuid
+ import numpy
+ import pandas
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.spark.accessors
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.spark.accessors.__dict__.copy()
+ globs["np"] = numpy
+ globs["pd"] = pandas
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.spark.accessors tests")
+ .getOrCreate()
+ )
+
+ db_name = "db%s" % str(uuid.uuid4()).replace("-", "")
+ spark.sql("CREATE DATABASE %s" % db_name)
+ globs["db"] = db_name
+
+ path = tempfile.mkdtemp()
+ globs["path"] = path
+
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.spark.accessors,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+
+ shutil.rmtree(path, ignore_errors=True)
+ spark.sql("DROP DATABASE IF EXISTS %s CASCADE" % db_name)
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/spark/utils.py b/python/pyspark/pandas/spark/utils.py
index 46505a9..09aed45 100644
--- a/python/pyspark/pandas/spark/utils.py
+++ b/python/pyspark/pandas/spark/utils.py
@@ -122,3 +122,22 @@ StructField(B,DecimalType(30,15),false)))
return DecimalType(precision=precision, scale=scale)
else:
return dt
+
+
+def _test():
+ import doctest
+ import sys
+ import pyspark.pandas.spark.utils
+
+ globs = pyspark.pandas.spark.utils.__dict__.copy()
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.spark.utils,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/sql.py b/python/pyspark/pandas/sql_processor.py
similarity index 93%
rename from python/pyspark/pandas/sql.py
rename to python/pyspark/pandas/sql_processor.py
index aec2ae9..f26e82f 100644
--- a/python/pyspark/pandas/sql.py
+++ b/python/pyspark/pandas/sql_processor.py
@@ -300,3 +300,33 @@ class SQLProcessor(object):
if isinstance(var, (tuple, range)):
return self._convert_var(list(var))
raise ValueError("Unsupported variable type {}: {}".format(type(var).__name__, str(var)))
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.sql_processor
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.sql_processor.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.sql_processor tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.sql_processor,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py
index 3945184..a57f372 100644
--- a/python/pyspark/pandas/strings.py
+++ b/python/pyspark/pandas/strings.py
@@ -2287,3 +2287,33 @@ class StringMethods(object):
Not supported.
"""
raise NotImplementedError()
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.strings
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.strings.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]")
+ .appName("pyspark.pandas.strings tests")
+ .getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.strings,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
index 657c9a7..5d461f6 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -519,3 +519,22 @@ def infer_return_type(f) -> Union[SeriesType, DataFrameType, ScalarType, Unknown
return UnknownType(tpe)
else:
return ScalarType(*types)
+
+
+def _test():
+ import doctest
+ import sys
+ import pyspark.pandas.typedef.typehints
+
+ globs = pyspark.pandas.typedef.typehints.__dict__.copy()
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.typedef.typehints,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/usage_logging/__init__.py b/python/pyspark/pandas/usage_logging/__init__.py
index cd04d99..300f31a 100644
--- a/python/pyspark/pandas/usage_logging/__init__.py
+++ b/python/pyspark/pandas/usage_logging/__init__.py
@@ -25,7 +25,7 @@ from typing import Union
import pandas as pd
-from pyspark.pandas import config, namespace, sql
+from pyspark.pandas import config, namespace, sql_processor
from pyspark.pandas.accessors import KoalasFrameMethods
from pyspark.pandas.frame import DataFrame
from pyspark.pandas.datetimes import DatetimeMethods
@@ -114,8 +114,8 @@ def attach(logger_module: Union[str, ModuleType]) -> None:
except ImportError:
pass
- sql._CAPTURE_SCOPES = 3 # type: ignore
- modules.append(sql) # type: ignore
+ sql_processor._CAPTURE_SCOPES = 3 # type: ignore
+ modules.append(sql_processor) # type: ignore
# Modules
for target_module in modules:
diff --git a/python/pyspark/pandas/utils.py b/python/pyspark/pandas/utils.py
index 002bac94..dfbe766 100644
--- a/python/pyspark/pandas/utils.py
+++ b/python/pyspark/pandas/utils.py
@@ -876,3 +876,31 @@ def compare_disallow_null(left, right, comp):
def compare_allow_null(left, right, comp):
return left.isNull() | right.isNull() | comp(left, right)
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.utils
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.utils.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]").appName("pyspark.pandas.utils tests").getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.utils,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
diff --git a/python/pyspark/pandas/window.py b/python/pyspark/pandas/window.py
index c42f074..a4ba4ac 100644
--- a/python/pyspark/pandas/window.py
+++ b/python/pyspark/pandas/window.py
@@ -1737,3 +1737,31 @@ class ExpandingGroupby(Expanding):
numpy.var : Equivalent method for Numpy array.
"""
return super().var()
+
+
+def _test():
+ import os
+ import doctest
+ import sys
+ from pyspark.sql import SparkSession
+ import pyspark.pandas.window
+
+ os.chdir(os.environ["SPARK_HOME"])
+
+ globs = pyspark.pandas.window.__dict__.copy()
+ globs["pp"] = pyspark.pandas
+ spark = (
+ SparkSession.builder.master("local[4]").appName("pyspark.pandas.window tests").getOrCreate()
+ )
+ (failure_count, test_count) = doctest.testmod(
+ pyspark.pandas.window,
+ globs=globs,
+ optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+ )
+ spark.stop()
+ if failure_count:
+ sys.exit(-1)
+
+
+if __name__ == "__main__":
+ _test()
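For readers unfamiliar with the two doctest option flags used throughout this commit: ELLIPSIS lets `...` in an expected output match arbitrary text, and NORMALIZE_WHITESPACE treats any run of whitespace as equivalent, which keeps column-aligned DataFrame reprs from failing on spacing. A self-contained sketch (not part of the commit):

    import doctest

    def banner():
        """
        >>> banner()  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
        Welcome to Spark version ...
        """
        # Extra spaces and a concrete version: both are tolerated by the
        # flags, whether set inline (above) or globally (below).
        print("Welcome to    Spark version 3.2.0")

    if __name__ == "__main__":
        failures, _ = doctest.testmod(
            optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
        )
        raise SystemExit(1 if failures else 0)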