You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2020/02/11 01:05:37 UTC
[spark] branch branch-3.0 updated: [SPARK-30777][PYTHON][TESTS] Fix
test failures for Pandas >= 1.0.0
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new b2b7cca [SPARK-30777][PYTHON][TESTS] Fix test failures for Pandas >= 1.0.0
b2b7cca is described below
commit b2b7cca6dec575b578f093bc7caa80f1b9d7b170
Author: Bryan Cutler <cu...@gmail.com>
AuthorDate: Tue Feb 11 10:03:01 2020 +0900
[SPARK-30777][PYTHON][TESTS] Fix test failures for Pandas >= 1.0.0
### What changes were proposed in this pull request?
Fix PySpark test failures when using Pandas >= 1.0.0.
### Why are the changes needed?
Pandas 1.0.0 has recently been released and has API changes that result in PySpark test failures. This PR fixes the broken tests.
### Does this PR introduce any user-facing change?
No
### How was this patch tested?
Manually tested with Pandas 1.0.1 and PyArrow 0.16.0
Closes #27529 from BryanCutler/pandas-fix-tests-1.0-SPARK-30777.
Authored-by: Bryan Cutler <cu...@gmail.com>
Signed-off-by: HyukjinKwon <gu...@apache.org>
(cherry picked from commit 07a9885f2792be1353f4a923d649e90bc431cb38)
Signed-off-by: HyukjinKwon <gu...@apache.org>
---
python/pyspark/sql/tests/test_arrow.py | 4 ++--
python/pyspark/sql/tests/test_pandas_grouped_map.py | 6 +++---
python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py | 8 ++++----
3 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py
index 98f44df..004c79f 100644
--- a/python/pyspark/sql/tests/test_arrow.py
+++ b/python/pyspark/sql/tests/test_arrow.py
@@ -297,9 +297,9 @@ class ArrowTests(ReusedSQLTestCase):
# Some series get converted for Spark to consume, this makes sure input is unchanged
pdf = self.create_pandas_data_frame()
# Use a nanosecond value to make sure it is not truncated
- pdf.ix[0, '8_timestamp_t'] = pd.Timestamp(1)
+ pdf.iloc[0, 7] = pd.Timestamp(1)
# Integers with nulls will get NaNs filled with 0 and will be casted
- pdf.ix[1, '2_int_t'] = None
+ pdf.iloc[1, 1] = None
pdf_copy = pdf.copy(deep=True)
self.spark.createDataFrame(pdf, schema=self.schema)
self.assertTrue(pdf.equals(pdf_copy))
diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py
index 51dd07f..ff53a0c 100644
--- a/python/pyspark/sql/tests/test_pandas_grouped_map.py
+++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py
@@ -390,11 +390,11 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
# Function returns a pdf with required column names, but order could be arbitrary using dict
def change_col_order(pdf):
# Constructing a DataFrame from a dict should result in the same order,
- # but use from_items to ensure the pdf column order is different than schema
- return pd.DataFrame.from_items([
+ # but use OrderedDict to ensure the pdf column order is different than schema
+ return pd.DataFrame.from_dict(OrderedDict([
('id', pdf.id),
('u', pdf.v * 2),
- ('v', pdf.v)])
+ ('v', pdf.v)]))
ordered_udf = pandas_udf(
change_col_order,
diff --git a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py
index 974ad56..2167978 100644
--- a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py
+++ b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py
@@ -357,7 +357,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase):
plus_one(sum_udf(col('v1'))),
sum_udf(plus_one(col('v2'))))
.sort(['id', '(v % 2)'])
- .toPandas().sort_index(by=['id', '(v % 2)']))
+ .toPandas().sort_values(by=['id', '(v % 2)']))
expected1 = (df.withColumn('v1', df.v + 1)
.withColumn('v2', df.v + 2)
@@ -368,7 +368,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase):
plus_one(sum(col('v1'))),
sum(plus_one(col('v2'))))
.sort(['id', '(v % 2)'])
- .toPandas().sort_index(by=['id', '(v % 2)']))
+ .toPandas().sort_values(by=['id', '(v % 2)']))
# Test complex expressions with sql expression, scala pandas UDF and
# group aggregate pandas UDF
@@ -381,7 +381,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase):
plus_two(sum_udf(col('v1'))),
sum_udf(plus_two(col('v2'))))
.sort(['id', '(v % 2)'])
- .toPandas().sort_index(by=['id', '(v % 2)']))
+ .toPandas().sort_values(by=['id', '(v % 2)']))
expected2 = (df.withColumn('v1', df.v + 1)
.withColumn('v2', df.v + 2)
@@ -392,7 +392,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase):
plus_two(sum(col('v1'))),
sum(plus_two(col('v2'))))
.sort(['id', '(v % 2)'])
- .toPandas().sort_index(by=['id', '(v % 2)']))
+ .toPandas().sort_values(by=['id', '(v % 2)']))
# Test sequential groupby aggregate
result3 = (df.groupby('id')
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org