Posted to commits@spark.apache.org by gu...@apache.org on 2020/02/11 01:05:37 UTC

[spark] branch branch-3.0 updated: [SPARK-30777][PYTHON][TESTS] Fix test failures for Pandas >= 1.0.0

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new b2b7cca  [SPARK-30777][PYTHON][TESTS] Fix test failures for Pandas >= 1.0.0
b2b7cca is described below

commit b2b7cca6dec575b578f093bc7caa80f1b9d7b170
Author: Bryan Cutler <cu...@gmail.com>
AuthorDate: Tue Feb 11 10:03:01 2020 +0900

    [SPARK-30777][PYTHON][TESTS] Fix test failures for Pandas >= 1.0.0
    
    ### What changes were proposed in this pull request?
    
    Fix PySpark test failures when using Pandas >= 1.0.0.
    
    ### Why are the changes needed?
    
    Pandas 1.0.0 was recently released with API changes that cause PySpark test failures; this PR fixes the broken tests.
    
    ### Does this PR introduce any user-facing change?
    
    No
    
    ### How was this patch tested?
    
    Manually tested with Pandas 1.0.1 and PyArrow 0.16.0
    
    Closes #27529 from BryanCutler/pandas-fix-tests-1.0-SPARK-30777.
    
    Authored-by: Bryan Cutler <cu...@gmail.com>
    Signed-off-by: HyukjinKwon <gu...@apache.org>
    (cherry picked from commit 07a9885f2792be1353f4a923d649e90bc431cb38)
    Signed-off-by: HyukjinKwon <gu...@apache.org>
---
 python/pyspark/sql/tests/test_arrow.py                  | 4 ++--
 python/pyspark/sql/tests/test_pandas_grouped_map.py     | 6 +++---
 python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py
index 98f44df..004c79f 100644
--- a/python/pyspark/sql/tests/test_arrow.py
+++ b/python/pyspark/sql/tests/test_arrow.py
@@ -297,9 +297,9 @@ class ArrowTests(ReusedSQLTestCase):
         # Some series get converted for Spark to consume, this makes sure input is unchanged
         pdf = self.create_pandas_data_frame()
         # Use a nanosecond value to make sure it is not truncated
-        pdf.ix[0, '8_timestamp_t'] = pd.Timestamp(1)
+        pdf.iloc[0, 7] = pd.Timestamp(1)
         # Integers with nulls will get NaNs filled with 0 and will be casted
-        pdf.ix[1, '2_int_t'] = None
+        pdf.iloc[1, 1] = None
         pdf_copy = pdf.copy(deep=True)
         self.spark.createDataFrame(pdf, schema=self.schema)
         self.assertTrue(pdf.equals(pdf_copy))
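
The hunk above reflects the removal of the .ix indexer in pandas 1.0; the test
now addresses the same cells by integer position with .iloc. A minimal
standalone sketch (not part of the commit; the frame and column positions
below are hypothetical, while the real test uses columns 7 and 1 of its schema):

    import pandas as pd

    pdf = pd.DataFrame({'2_int_t': [1, 2],
                        '8_timestamp_t': pd.to_datetime([0, 0])})

    # pandas < 1.0 allowed mixed label/position access:
    #     pdf.ix[0, '8_timestamp_t'] = pd.Timestamp(1)
    # pandas >= 1.0: address the cell purely by position (row 0, column 1 here)
    pdf.iloc[0, 1] = pd.Timestamp(1)
    print(pdf)
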
diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py
index 51dd07f..ff53a0c 100644
--- a/python/pyspark/sql/tests/test_pandas_grouped_map.py
+++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py
@@ -390,11 +390,11 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
         # Function returns a pdf with required column names, but order could be arbitrary using dict
         def change_col_order(pdf):
             # Constructing a DataFrame from a dict should result in the same order,
-            # but use from_items to ensure the pdf column order is different than schema
-            return pd.DataFrame.from_items([
+            # but use OrderedDict to ensure the pdf column order is different than schema
+            return pd.DataFrame.from_dict(OrderedDict([
                 ('id', pdf.id),
                 ('u', pdf.v * 2),
-                ('v', pdf.v)])
+                ('v', pdf.v)]))
 
         ordered_udf = pandas_udf(
             change_col_order,
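
The change above replaces DataFrame.from_items, which was removed in pandas
1.0, with from_dict over an OrderedDict so the column order is still pinned
explicitly. A minimal sketch (not part of the commit; the input frame is
illustrative only):

    from collections import OrderedDict
    import pandas as pd

    pdf = pd.DataFrame({'id': [1, 2], 'v': [10, 20]})

    # Build the result with an explicit column order, as the updated test does.
    result = pd.DataFrame.from_dict(OrderedDict([
        ('id', pdf.id),
        ('u', pdf.v * 2),
        ('v', pdf.v)]))

    print(result.columns.tolist())  # ['id', 'u', 'v'] -- insertion order kept
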
diff --git a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py
index 974ad56..2167978 100644
--- a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py
+++ b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py
@@ -357,7 +357,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase):
                         plus_one(sum_udf(col('v1'))),
                         sum_udf(plus_one(col('v2'))))
                    .sort(['id', '(v % 2)'])
-                   .toPandas().sort_index(by=['id', '(v % 2)']))
+                   .toPandas().sort_values(by=['id', '(v % 2)']))
 
         expected1 = (df.withColumn('v1', df.v + 1)
                      .withColumn('v2', df.v + 2)
@@ -368,7 +368,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase):
                           plus_one(sum(col('v1'))),
                           sum(plus_one(col('v2'))))
                      .sort(['id', '(v % 2)'])
-                     .toPandas().sort_index(by=['id', '(v % 2)']))
+                     .toPandas().sort_values(by=['id', '(v % 2)']))
 
         # Test complex expressions with sql expression, scala pandas UDF and
         # group aggregate pandas UDF
@@ -381,7 +381,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase):
                         plus_two(sum_udf(col('v1'))),
                         sum_udf(plus_two(col('v2'))))
                    .sort(['id', '(v % 2)'])
-                   .toPandas().sort_index(by=['id', '(v % 2)']))
+                   .toPandas().sort_values(by=['id', '(v % 2)']))
 
         expected2 = (df.withColumn('v1', df.v + 1)
                      .withColumn('v2', df.v + 2)
@@ -392,7 +392,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase):
                           plus_two(sum(col('v1'))),
                           sum(plus_two(col('v2'))))
                      .sort(['id', '(v % 2)'])
-                     .toPandas().sort_index(by=['id', '(v % 2)']))
+                     .toPandas().sort_values(by=['id', '(v % 2)']))
 
         # Test sequential groupby aggregate
         result3 = (df.groupby('id')

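
The repeated change in this last file swaps sort_index(by=...), whose "by"
keyword was removed in pandas 1.0, for sort_values(by=...). A minimal sketch
(not part of the commit; the frame is illustrative only):

    import pandas as pd

    pdf = pd.DataFrame({'id': [2, 1, 2, 1], '(v % 2)': [1, 0, 0, 1]})

    # pandas < 1.0 (long deprecated):  pdf.sort_index(by=['id', '(v % 2)'])
    # pandas >= 1.0: sorting by column values goes through sort_values
    print(pdf.sort_values(by=['id', '(v % 2)']))
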

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org