Posted to commits@spark.apache.org by gu...@apache.org on 2022/07/25 03:51:37 UTC

[spark] branch branch-3.0 updated: [SPARK-39856][SQL][TESTS] Increase the number of partitions in TPC-DS build to avoid out-of-memory

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 0a27d0c6e8e [SPARK-39856][SQL][TESTS] Increase the number of partitions in TPC-DS build to avoid out-of-memory
0a27d0c6e8e is described below

commit 0a27d0c6e8e705176f0f245794bc8361860ac680
Author: Hyukjin Kwon <gu...@apache.org>
AuthorDate: Mon Jul 25 12:44:54 2022 +0900

    [SPARK-39856][SQL][TESTS] Increase the number of partitions in TPC-DS build to avoid out-of-memory
    
    This PR proposes to avoid out-of-memory errors in the TPC-DS build at GitHub Actions CI by:
    
    - Increasing the number of partitions used in the shuffle.
    - Truncating floating-point values after the 10th decimal place (a minimal sketch follows this list).
        The number of partitions was previously set to 1 to avoid precision differences in the results, which we can generally just ignore.
    - Sorting the results regardless of join type, since Apache Spark does not guarantee the order of results.
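    
    For illustration, here is a minimal, self-contained sketch of the truncation step (the object name and sample values are hypothetical; unlike the committed pattern in the diff below, the decimal point is escaped here, since an unescaped `.` matches any single character):
    
        object TruncatePrecisionSketch {
          // Keep at most 10 digits after the decimal point and drop the rest;
          // "$1" is the capture group holding the integer part, the point,
          // and the first 10 fractional digits.
          private val pattern = """([0-9]+\.[0-9]{10})[0-9]*"""
    
          def truncate(s: String): String = s.replaceAll(pattern, "$1")
    
          def main(args: Array[String]): Unit = {
            println(truncate("3.14159265358979323"))  // 3.1415926535
            println(truncate("1.25"))                 // unchanged: fewer than 10 digits
          }
        }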
    
    One of the reasons for the large memory usage seems to be the single partition being used in the shuffle.
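    
    As a hedged illustration of the effect (hypothetical object name; adaptive query execution is disabled so the partition count stays deterministic), the sketch below shows that a shuffle-producing operator such as groupBy yields exactly spark.sql.shuffle.partitions output partitions, so with the previous value of 1 a single task had to hold all shuffled rows:
    
        import org.apache.spark.sql.SparkSession
    
        object ShufflePartitionsSketch {
          def main(args: Array[String]): Unit = {
            val spark = SparkSession.builder()
              .master("local[1]")
              .appName("ShufflePartitionsSketch")
              .config("spark.sql.shuffle.partitions", "4")    // was "1" before this change
              .config("spark.sql.adaptive.enabled", "false")  // keep the count deterministic
              .getOrCreate()
            import spark.implicits._
    
            // groupBy forces a shuffle; the shuffled result has as many
            // partitions as spark.sql.shuffle.partitions.
            val grouped = (1 to 100000).toDF("id").groupBy($"id" % 97).count()
            println(grouped.rdd.getNumPartitions)  // 4
            spark.stop()
          }
        }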
    
    No, test-only.
    
    The GitHub Actions CI in this PR will test it out.
    
    Closes #37270 from HyukjinKwon/deflake-tpcds.
    
    Authored-by: Hyukjin Kwon <gu...@apache.org>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
    (cherry picked from commit 7358253755762f9bfe6cedc1a50ec14616cfeace)
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 .../test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala    | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala
index c16bcd9fd05..7fb4b567b1a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala
@@ -58,7 +58,7 @@ class TPCDSQueryTestSuite extends QueryTest with TPCDSBase with SQLQueryTestHelp
 
   // To make output results deterministic
   protected override def sparkConf: SparkConf = super.sparkConf
-    .set(SQLConf.SHUFFLE_PARTITIONS.key, "1")
+    .set(SQLConf.SHUFFLE_PARTITIONS.key, 4.toString)
 
   protected override def createSparkSession: TestSparkSession = {
     new TestSparkSession(new SparkContext("local[1]", this.getClass.getSimpleName, sparkConf))
@@ -103,7 +103,9 @@ class TPCDSQueryTestSuite extends QueryTest with TPCDSBase with SQLQueryTestHelp
   private def runQuery(query: String, goldenFile: File): Unit = {
     val (schema, output) = handleExceptions(getNormalizedResult(spark, query))
     val queryString = query.trim
-    val outputString = output.mkString("\n").replaceAll("\\s+$", "")
+    val outputString = output.mkString("\n")
+      .replaceAll("\\s+$", "")
+      .replaceAll("""([0-9]+.[0-9]{10})([0-9]*)""", "$1")
     if (regenerateGoldenFiles) {
       val goldenOutput = {
         s"-- Automatically generated by ${getClass.getSimpleName}\n\n" +
@@ -130,7 +132,8 @@ class TPCDSQueryTestSuite extends QueryTest with TPCDSBase with SQLQueryTestHelp
         s"Expected 3 blocks in result file but got ${segments.size}. " +
           "Try regenerate the result files.")
 
-      (segments(1).trim, segments(2).replaceAll("\\s+$", ""))
+      (segments(1).trim, segments(2)
+        .replaceAll("\\s+$", "").replaceAll("""([0-9]+.[0-9]{10})([0-9]*)""", "$1"))
     }
 
     assertResult(expectedSchema, s"Schema did not match\n$queryString") { schema }

