You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2020/07/15 14:53:50 UTC

[spark] branch branch-2.4 updated: [SPARK-32318][SQL][TESTS] Add a test case to EliminateSortsSuite for ORDER BY in DISTRIBUTE BY

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new 9aeeb0f  [SPARK-32318][SQL][TESTS] Add a test case to EliminateSortsSuite for ORDER BY in DISTRIBUTE BY
9aeeb0f is described below

commit 9aeeb0f5932550c8025b6804235a50fc203da3a1
Author: Dongjoon Hyun <do...@apache.org>
AuthorDate: Wed Jul 15 07:43:56 2020 -0700

    [SPARK-32318][SQL][TESTS] Add a test case to EliminateSortsSuite for ORDER BY in DISTRIBUTE BY
    
    This PR aims to add a test case to EliminateSortsSuite to protect a valid use case: using ORDER BY in a subquery of a DISTRIBUTE BY statement.
    
    ```scala
    scala> scala.util.Random.shuffle((1 to 100000).map(x => (x % 2, x))).toDF("a", "b").repartition(2).createOrReplaceTempView("t")
    
    scala> sql("select * from (select * from t order by b) distribute by a").write.orc("/tmp/master")
    
    $ ls -al /tmp/master/
    total 56
    drwxr-xr-x  10 dongjoon  wheel  320 Jul 14 22:12 ./
    drwxrwxrwt  15 root      wheel  480 Jul 14 22:12 ../
    -rw-r--r--   1 dongjoon  wheel    8 Jul 14 22:12 ._SUCCESS.crc
    -rw-r--r--   1 dongjoon  wheel   12 Jul 14 22:12 .part-00000-2cd3a50e-eded-49a4-b7cf-94e3f090b8c1-c000.snappy.orc.crc
    -rw-r--r--   1 dongjoon  wheel   16 Jul 14 22:12 .part-00043-2cd3a50e-eded-49a4-b7cf-94e3f090b8c1-c000.snappy.orc.crc
    -rw-r--r--   1 dongjoon  wheel   16 Jul 14 22:12 .part-00191-2cd3a50e-eded-49a4-b7cf-94e3f090b8c1-c000.snappy.orc.crc
    -rw-r--r--   1 dongjoon  wheel    0 Jul 14 22:12 _SUCCESS
    -rw-r--r--   1 dongjoon  wheel  119 Jul 14 22:12 part-00000-2cd3a50e-eded-49a4-b7cf-94e3f090b8c1-c000.snappy.orc
    -rw-r--r--   1 dongjoon  wheel  932 Jul 14 22:12 part-00043-2cd3a50e-eded-49a4-b7cf-94e3f090b8c1-c000.snappy.orc
    -rw-r--r--   1 dongjoon  wheel  939 Jul 14 22:12 part-00191-2cd3a50e-eded-49a4-b7cf-94e3f090b8c1-c000.snappy.orc
    ```
    
    The following was found during SPARK-32276. If the Spark optimizer removes the inner `ORDER BY`, the output file size increases significantly, as shown below.
    ```scala
    scala> scala.util.Random.shuffle((1 to 100000).map(x => (x % 2, x))).toDF("a", "b").repartition(2).createOrReplaceTempView("t")
    
    scala> sql("select * from (select * from t order by b) distribute by a").write.orc("/tmp/SPARK-32276")
    
    $ ls -al /tmp/SPARK-32276/
    total 632
    drwxr-xr-x  10 dongjoon  wheel     320 Jul 14 22:08 ./
    drwxrwxrwt  14 root      wheel     448 Jul 14 22:08 ../
    -rw-r--r--   1 dongjoon  wheel       8 Jul 14 22:08 ._SUCCESS.crc
    -rw-r--r--   1 dongjoon  wheel      12 Jul 14 22:08 .part-00000-ba5049f9-b835-49b7-9fdb-bdd11b9891cb-c000.snappy.orc.crc
    -rw-r--r--   1 dongjoon  wheel    1188 Jul 14 22:08 .part-00043-ba5049f9-b835-49b7-9fdb-bdd11b9891cb-c000.snappy.orc.crc
    -rw-r--r--   1 dongjoon  wheel    1188 Jul 14 22:08 .part-00191-ba5049f9-b835-49b7-9fdb-bdd11b9891cb-c000.snappy.orc.crc
    -rw-r--r--   1 dongjoon  wheel       0 Jul 14 22:08 _SUCCESS
    -rw-r--r--   1 dongjoon  wheel     119 Jul 14 22:08 part-00000-ba5049f9-b835-49b7-9fdb-bdd11b9891cb-c000.snappy.orc
    -rw-r--r--   1 dongjoon  wheel  150735 Jul 14 22:08 part-00043-ba5049f9-b835-49b7-9fdb-bdd11b9891cb-c000.snappy.orc
    -rw-r--r--   1 dongjoon  wheel  150741 Jul 14 22:08 part-00191-ba5049f9-b835-49b7-9fdb-bdd11b9891cb-c000.snappy.orc
    ```
    
    No user-facing change. This only improves the test coverage.
    
    Testing: pass the GitHub Actions or Jenkins CI runs.
    
    Closes #29118 from dongjoon-hyun/SPARK-32318.
    
    Authored-by: Dongjoon Hyun <do...@apache.org>
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
    (cherry picked from commit 8950dcbb1cafccc2ba8bbf030ab7ac86cfe203a4)
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 .../spark/sql/catalyst/optimizer/EliminateSortsSuite.scala       | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSortsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSortsSuite.scala
index e318f36..5d4f99a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSortsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSortsSuite.scala
@@ -83,4 +83,13 @@ class EliminateSortsSuite extends PlanTest {
 
     comparePlans(optimized, correctAnswer)
   }
+
+  test("SPARK-32318: should not remove orderBy in distribute statement") {
+    val projectPlan = testRelation.select('a, 'b)
+    val orderByPlan = projectPlan.orderBy('b.desc)
+    val distributedPlan = orderByPlan.distribute('a)(1)
+    val optimized = Optimize.execute(distributedPlan.analyze)
+    val correctAnswer = distributedPlan.analyze
+    comparePlans(optimized, correctAnswer)
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org