You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2022/08/12 14:05:02 UTC
[spark] branch master updated: [SPARK-40057][PYTHON][DOCS] Cleanup "<BLANKLINE>" in doctest
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 350a74fe8d5 [SPARK-40057][PYTHON][DOCS] Cleanup "<BLANKLINE>" in doctest
350a74fe8d5 is described below
commit 350a74fe8d5d5f0f82dac4e3123e71d896bdf09a
Author: Yikun Jiang <yi...@gmail.com>
AuthorDate: Fri Aug 12 23:04:47 2022 +0900
[SPARK-40057][PYTHON][DOCS] Cleanup "<BLANKLINE>" in doctest
### What changes were proposed in this pull request?
Cleanup `<BLANKLINE>` in doctest
### Why are the changes needed?
See https://github.com/apache/spark/pull/37465#discussion_r943071168; we'd better clean up all `<BLANKLINE>` occurrences in doctests to make the doctest code clearer.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
CI passed
Closes #37492 from Yikun/blankline.
Authored-by: Yikun Jiang <yi...@gmail.com>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
python/pyspark/ml/stat.py | 8 +++-----
python/pyspark/mllib/tree.py | 9 +++------
python/pyspark/pandas/frame.py | 1 -
python/pyspark/pandas/series.py | 38 +++++++++++++++++---------------------
python/pyspark/sql/dataframe.py | 1 -
5 files changed, 23 insertions(+), 34 deletions(-)
diff --git a/python/pyspark/ml/stat.py b/python/pyspark/ml/stat.py
index b91ef1b6cb3..704d2dc9baa 100644
--- a/python/pyspark/ml/stat.py
+++ b/python/pyspark/ml/stat.py
@@ -274,28 +274,24 @@ class Summarizer:
+-----------------------------------+
|{[1.0,1.0,1.0], 1} |
+-----------------------------------+
- <BLANKLINE>
>>> df.select(summarizer.summary(df.features)).show(truncate=False)
+--------------------------------+
|aggregate_metrics(features, 1.0)|
+--------------------------------+
|{[1.0,1.5,2.0], 2} |
+--------------------------------+
- <BLANKLINE>
>>> df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)
+--------------+
|mean(features)|
+--------------+
|[1.0,1.0,1.0] |
+--------------+
- <BLANKLINE>
>>> df.select(Summarizer.mean(df.features)).show(truncate=False)
+--------------+
|mean(features)|
+--------------+
|[1.0,1.5,2.0] |
+--------------+
- <BLANKLINE>
"""
@staticmethod
@@ -519,7 +515,9 @@ if __name__ == "__main__":
globs["sc"] = sc
globs["spark"] = spark
- failure_count, test_count = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
+ failure_count, test_count = doctest.testmod(
+ globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
+ )
spark.stop()
if failure_count:
sys.exit(-1)
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index e1d87e99c8a..8a5c25d96a7 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -273,7 +273,6 @@ class DecisionTree:
Predict: 0.0
Else (feature 0 > 0.5)
Predict: 1.0
- <BLANKLINE>
>>> model.predict(array([1.0]))
1.0
>>> model.predict(array([0.0]))
@@ -511,10 +510,8 @@ class RandomForest:
7
>>> print(model)
TreeEnsembleModel classifier with 3 trees
- <BLANKLINE>
>>> print(model.toDebugString())
TreeEnsembleModel classifier with 3 trees
- <BLANKLINE>
Tree 0:
Predict: 1.0
Tree 1:
@@ -527,7 +524,6 @@ class RandomForest:
Predict: 0.0
Else (feature 0 > 1.5)
Predict: 1.0
- <BLANKLINE>
>>> model.predict([2.0])
1.0
>>> model.predict([0.0])
@@ -764,7 +760,6 @@ class GradientBoostedTrees:
30
>>> print(model) # it already has newline
TreeEnsembleModel classifier with 10 trees
- <BLANKLINE>
>>> model.predict([2.0])
1.0
>>> model.predict([0.0])
@@ -881,7 +876,9 @@ def _test() -> None:
spark = SparkSession.builder.master("local[4]").appName("mllib.tree tests").getOrCreate()
globs["sc"] = spark.sparkContext
- (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
+ (failure_count, test_count) = doctest.testmod(
+ globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
+ )
spark.stop()
if failure_count:
sys.exit(-1)
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 465541abdaa..b3ded9885fc 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -2217,7 +2217,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
Donatello & purple & bo staff \\
\bottomrule
\end{tabular}
- <BLANKLINE>
"""
args = locals()
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index ff4c7fcc8f1..62eaa3eb1ca 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -2775,11 +2775,10 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
Examples
--------
>>> psser = ps.Series([2, 1, 3, 3], name='A')
- >>> psser.unique().sort_values() # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
- <BLANKLINE>
- ... 1
- ... 2
- ... 3
+ >>> psser.unique().sort_values()
+ 1 1
+ 0 2
+ 2 3
Name: A, dtype: int64
>>> ps.Series([pd.Timestamp('2016-01-01') for _ in range(3)]).unique()
@@ -2787,11 +2786,10 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
dtype: datetime64[ns]
>>> psser.name = ('x', 'a')
- >>> psser.unique().sort_values() # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
- <BLANKLINE>
- ... 1
- ... 2
- ... 3
+ >>> psser.unique().sort_values()
+ 1 1
+ 0 2
+ 2 3
Name: (x, a), dtype: int64
"""
sdf = self._internal.spark_frame.select(self.spark.column).distinct()
@@ -4718,21 +4716,19 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
13 NaN
dtype: float64
- >>> s.mode().sort_values() # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
- <BLANKLINE>
- ... 1.0
- ... 2.0
- ... 3.0
+ >>> s.mode().sort_values()
+ 0 1.0
+ 1 2.0
+ 2 3.0
dtype: float64
With 'dropna' set to 'False', we can also see NaN in the result
- >>> s.mode(False).sort_values() # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
- <BLANKLINE>
- ... 1.0
- ... 2.0
- ... 3.0
- ... NaN
+ >>> s.mode(False).sort_values()
+ 0 1.0
+ 1 2.0
+ 2 3.0
+ 3 NaN
dtype: float64
"""
ser_count = self.value_counts(dropna=dropna, sort=False)
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 8ab3ed35578..565d3304596 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -387,7 +387,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
root
|-- age: integer (nullable = true)
|-- name: string (nullable = true)
- <BLANKLINE>
"""
print(self._jdf.schema().treeString())
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org