Posted to commits@spark.apache.org by gu...@apache.org on 2021/04/14 10:36:02 UTC

[spark] branch branch-3.1 updated: [SPARK-35002][YARN][TESTS][FOLLOW-UP] Fix java.net.BindException in MiniYARNCluster

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new 23e3626  [SPARK-35002][YARN][TESTS][FOLLOW-UP] Fix java.net.BindException in MiniYARNCluster
23e3626 is described below

commit 23e36266213edf736c6eb049e153dfe2e11728fb
Author: HyukjinKwon <gu...@apache.org>
AuthorDate: Wed Apr 14 17:13:48 2021 +0800

    [SPARK-35002][YARN][TESTS][FOLLOW-UP] Fix java.net.BindException in MiniYARNCluster
    
    This PR fixes the failures in the two test suites below:
    
    https://github.com/apache/spark/runs/2320161984
    
    ```
    [info] YarnShuffleIntegrationSuite:
    [info] org.apache.spark.deploy.yarn.YarnShuffleIntegrationSuite *** ABORTED *** (228 milliseconds)
    [info]   org.apache.hadoop.yarn.exceptions.YarnRuntimeException: org.apache.hadoop.yarn.webapp.WebAppException: Error starting http server
    [info]   at org.apache.hadoop.yarn.server.MiniYARNCluster.startResourceManager(MiniYARNCluster.java:373)
    [info]   at org.apache.hadoop.yarn.server.MiniYARNCluster.access$300(MiniYARNCluster.java:128)
    [info]   at org.apache.hadoop.yarn.server.MiniYARNCluster$ResourceManagerWrapper.serviceStart(MiniYARNCluster.java:503)
    [info]   at org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
    [info]   at org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:121)
    [info]   at org.apache.hadoop.yarn.server.MiniYARNCluster.serviceStart(MiniYARNCluster.java:322)
    [info]   at org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
    [info]   at org.apache.spark.deploy.yarn.BaseYarnClusterSuite.beforeAll(BaseYarnClusterSuite.scala:95)
    ...
    [info]   Cause: java.net.BindException: Port in use: fv-az186-831:0
    [info]   at org.apache.hadoop.http.HttpServer2.constructBindException(HttpServer2.java:1231)
    [info]   at org.apache.hadoop.http.HttpServer2.bindForSinglePort(HttpServer2.java:1253)
    [info]   at org.apache.hadoop.http.HttpServer2.openListeners(HttpServer2.java:1316)
    [info]   at org.apache.hadoop.http.HttpServer2.start(HttpServer2.java:1167)
    [info]   at org.apache.hadoop.yarn.webapp.WebApps$Builder.start(WebApps.java:449)
    [info]   at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.startWepApp(ResourceManager.java:1247)
    [info]   at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.serviceStart(ResourceManager.java:1356)
    [info]   at org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
    [info]   at org.apache.hadoop.yarn.server.MiniYARNCluster.startResourceManager(MiniYARNCluster.java:365)
    [info]   at org.apache.hadoop.yarn.server.MiniYARNCluster.access$300(MiniYARNCluster.java:128)
    [info]   at org.apache.hadoop.yarn.server.MiniYARNCluster$ResourceManagerWrapper.serviceStart(MiniYARNCluster.java:503)
    [info]   at org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
    [info]   at org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:121)
    [info]   at org.apache.hadoop.yarn.server.MiniYARNCluster.serviceStart(MiniYARNCluster.java:322)
    [info]   at org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
    [info]   at org.apache.spark.deploy.yarn.BaseYarnClusterSuite.beforeAll(BaseYarnClusterSuite.scala:95)
    [info]   at org.scalatest.BeforeAndAfterAll.liftedTree1$1(BeforeAndAfterAll.scala:212)
    [info]   at org.scalatest.BeforeAndAfterAll.run(BeforeAndAfterAll.scala:210)
    [info]   at org.scalatest.BeforeAndAfterAll.run$(BeforeAndAfterAll.scala:208)
    [info]   at org.apache.spark.SparkFunSuite.run(SparkFunSuite.scala:61)
    ...
    ```
    
    https://github.com/apache/spark/runs/2323342094
    
    ```
    [info] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadSecret started
    [error] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadSecret failed: java.lang.AssertionError: Connecting to /10.1.0.161:39895 timed out (120000 ms), took 120.081 sec
    [error]     at org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadSecret(ExternalShuffleSecuritySuite.java:85)
    [error]     ...
    [info] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadAppId started
    [error] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadAppId failed: java.lang.AssertionError: Connecting to /10.1.0.198:44633 timed out (120000 ms), took 120.08 sec
    [error]     at org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadAppId(ExternalShuffleSecuritySuite.java:76)
    [error]     ...
    [info] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testValid started
    [error] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testValid failed: java.io.IOException: Connecting to /10.1.0.119:43575 timed out (120000 ms), took 120.089 sec
    [error]     at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:285)
    [error]     at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
    [error]     at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:230)
    [error]     at org.apache.spark.network.shuffle.ExternalBlockStoreClient.registerWithShuffleServer(ExternalBlockStoreClient.java:211)
    [error]     at org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.validate(ExternalShuffleSecuritySuite.java:108)
    [error]     at org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testValid(ExternalShuffleSecuritySuite.java:68)
    [error]     ...
    [info] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testEncryption started
    [error] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testEncryption failed: java.io.IOException: Connecting to /10.1.0.248:35271 timed out (120000 ms), took 120.014 sec
    [error]     at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:285)
    [error]     at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
    [error]     at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:230)
    [error]     at org.apache.spark.network.shuffle.ExternalBlockStoreClient.registerWithShuffleServer(ExternalBlockStoreClient.java:211)
    [error]     at org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.validate(ExternalShuffleSecuritySuite.java:108)
    [error]     at org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testEncryption(ExternalShu
    ```
    
    For the YARN cluster suites, the underlying port conflict is difficult to fix, so this PR
    skips the whole suite when the MiniYARNCluster fails to bind (sketched below).
    For the shuffle-related suites, it uses the local host address, preferring SPARK_LOCAL_IP
    when it is set.
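
    In sketch form, the YARN-side change remembers whether the MiniYARNCluster managed to bind
    and turns every test into a skip rather than a failure when it did not. The suite name in
    this sketch is illustrative only; the real change is in BaseYarnClusterSuite (full diff below):

    ```
    import java.net.BindException

    import org.apache.commons.lang3.exception.ExceptionUtils
    import org.apache.hadoop.yarn.conf.YarnConfiguration
    import org.apache.hadoop.yarn.server.MiniYARNCluster
    import org.scalactic.source.Position
    import org.scalatest.Tag

    // Simplified sketch; BaseYarnClusterSuite also applies extra YARN test configuration.
    abstract class BindAwareYarnSuite extends SparkFunSuite {
      private var isBindSuccessful = true
      protected var yarnCluster: MiniYARNCluster = _

      // Wrap every test so that a failed bind becomes a skipped test instead of a failure.
      override protected def test(testName: String, testTags: Tag*)(testFun: => Any)
          (implicit pos: Position): Unit = {
        super.test(testName, testTags: _*) {
          assume(isBindSuccessful, "Mini Yarn cluster should be able to bind.")
          testFun
        }
      }

      override def beforeAll(): Unit = {
        super.beforeAll()
        try {
          yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1)
          yarnCluster.init(new YarnConfiguration())
          yarnCluster.start()
        } catch {
          // A BindException anywhere in the cause chain means the cluster could not bind a port.
          case e: Throwable
              if ExceptionUtils.indexOfThrowable(e, classOf[BindException]) != -1 =>
            isBindSuccessful = false
        }
      }

      override def afterAll(): Unit = {
        try {
          // The cluster may never have been assigned if start() threw above.
          if (yarnCluster != null) yarnCluster.stop()
        } finally {
          super.afterAll()
        }
      }
    }
    ```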
    
    The goal is to make these tests stable in CI.
    
    No user-facing change; this is dev-only.
    
    It was tested in GitHub Actions: https://github.com/HyukjinKwon/spark/runs/2340210765
    
    Closes #32126 from HyukjinKwon/SPARK-35002-followup.
    
    Authored-by: HyukjinKwon <gu...@apache.org>
    Signed-off-by: Yuming Wang <yu...@ebay.com>
    (cherry picked from commit a153efa643dcb1d8e6c2242846b3db0b2be39ae7)
    Signed-off-by: HyukjinKwon <gu...@apache.org>
---
 .../java/org/apache/spark/network/TestUtils.java   |  4 +++-
 .../spark/deploy/yarn/BaseYarnClusterSuite.scala   | 27 ++++++++++++++++++----
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/common/network-common/src/test/java/org/apache/spark/network/TestUtils.java b/common/network-common/src/test/java/org/apache/spark/network/TestUtils.java
index 56a2b80..c2c5ffa 100644
--- a/common/network-common/src/test/java/org/apache/spark/network/TestUtils.java
+++ b/common/network-common/src/test/java/org/apache/spark/network/TestUtils.java
@@ -22,7 +22,9 @@ import java.net.InetAddress;
 public class TestUtils {
   public static String getLocalHost() {
     try {
-      return InetAddress.getLocalHost().getHostAddress();
+      return System.getenv().containsKey("SPARK_LOCAL_IP")
+        ? System.getenv("SPARK_LOCAL_IP")
+        : InetAddress.getLocalHost().getHostAddress();
     } catch (Exception e) {
       throw new RuntimeException(e);
     }
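
The same lookup, as a minimal Scala sketch for reference (the object and method names are
illustrative only; the actual change is the Java hunk above):

```
import java.net.InetAddress

object LocalHostSketch {
  // Prefer SPARK_LOCAL_IP when the environment provides it; otherwise fall back to the
  // machine's own address, which is what TestUtils.getLocalHost returned before this change.
  def localHost(): String =
    sys.env.getOrElse("SPARK_LOCAL_IP", InetAddress.getLocalHost.getHostAddress)
}
```
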
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
index 20f5339..2542b45 100644
--- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
@@ -28,7 +28,8 @@ import scala.concurrent.duration._
 import com.google.common.io.Files
 import org.apache.hadoop.yarn.conf.YarnConfiguration
 import org.apache.hadoop.yarn.server.MiniYARNCluster
-import org.scalatest.BeforeAndAfterAll
+import org.scalactic.source.Position
+import org.scalatest.{BeforeAndAfterAll, Tag}
 import org.scalatest.concurrent.Eventually._
 import org.scalatest.matchers.must.Matchers
 
@@ -41,6 +42,7 @@ import org.apache.spark.util.Utils
 
 abstract class BaseYarnClusterSuite
   extends SparkFunSuite with BeforeAndAfterAll with Matchers with Logging {
+  private var isBindSuccessful = true
 
   // log4j configuration for the YARN containers, so that their output is collected
   // by YARN instead of trying to overwrite unit-tests.log.
@@ -64,6 +66,14 @@ abstract class BaseYarnClusterSuite
 
   def newYarnConfig(): YarnConfiguration
 
+  override protected def test(testName: String, testTags: Tag*)(testFun: => Any)
+                             (implicit pos: Position): Unit = {
+    super.test(testName, testTags: _*) {
+      assume(isBindSuccessful, "Mini Yarn cluster should be able to bind.")
+      testFun
+    }
+  }
+
   override def beforeAll(): Unit = {
     super.beforeAll()
 
@@ -80,9 +90,16 @@ abstract class BaseYarnClusterSuite
     yarnConf.set("yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage",
       "100.0")
 
-    yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1)
-    yarnCluster.init(yarnConf)
-    yarnCluster.start()
+    try {
+      yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1)
+      yarnCluster.init(yarnConf)
+      yarnCluster.start()
+    } catch {
+      case e: Throwable if org.apache.commons.lang3.exception.ExceptionUtils.indexOfThrowable(
+          e, classOf[java.net.BindException]) != -1 =>
+        isBindSuccessful = false
+        return
+    }
 
     // There's a race in MiniYARNCluster in which start() may return before the RM has updated
     // its address in the configuration. You can see this in the logs by noticing that when
@@ -118,7 +135,7 @@ abstract class BaseYarnClusterSuite
 
   override def afterAll(): Unit = {
     try {
-      yarnCluster.stop()
+      if (yarnCluster != null) yarnCluster.stop()
     } finally {
       super.afterAll()
     }
