You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by "stefanbuk-db (via GitHub)" <gi...@apache.org> on 2024/03/14 12:35:08 UTC

[PR] [SPARK-47379][TESTS] Improve docker JDBC suite test reliability [spark]

stefanbuk-db opened a new pull request, #45518:
URL: https://github.com/apache/spark/pull/45518

   ### What changes were proposed in this pull request?
   In this PR I propose changes in Docker JDBC suite with helper function for test retry and better error handling.
   
   
   ### Why are the changes needed?
   This changes improve general reliability of docker tests that fail at certain workloads.
   
   
   ### Does this PR introduce _any_ user-facing change?
   No
   
   
   ### How was this patch tested?
   This is patch for tests.
   
   
   ### Was this patch authored or co-authored using generative AI tooling?
   No
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Re: [PR] [SPARK-47379][TESTS] Improve docker JDBC suite test reliability [spark]

Posted by "yaooqinn (via GitHub)" <gi...@apache.org>.
yaooqinn commented on code in PR #45518:
URL: https://github.com/apache/spark/pull/45518#discussion_r1524847980


##########
connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala:
##########
@@ -129,104 +130,129 @@ abstract class DockerJDBCIntegrationSuite
   private var pulled: Boolean = false
   protected var jdbcUrl: String = _
 
+  @tailrec private def retry_helper[T](n: Int)(body: => T): T = {
+    try body
+    catch {
+      case e: Throwable =>
+        if (n > 0) {
+          logWarning(e.getMessage, e)
+          logInfo(s"\n\n===== RETRYING =====\n")
+          retry_helper(n - 1)(body)
+        }
+        else throw e
+    }
+  }
+
   override def beforeAll(): Unit = runIfTestsEnabled(s"Prepare for ${this.getClass.getName}") {
-    super.beforeAll()
-    try {
-      val config = DefaultDockerClientConfig.createDefaultConfigBuilder.build
-      val httpClient = new ZerodepDockerHttpClient.Builder()
-        .dockerHost(config.getDockerHost)
-        .sslConfig(config.getSSLConfig)
-        .build()
-      docker = DockerClientImpl.getInstance(config, httpClient)
-      // Check that Docker is actually up
+    retry_helper(5) {
+      super.beforeAll()
       try {
-        docker.pingCmd().exec()
-      } catch {
-        case NonFatal(e) =>
-          log.error("Exception while connecting to Docker. Check whether Docker is running.")
-          throw e
-      }
-      try {
-        // Ensure that the Docker image is installed:
-        docker.inspectImageCmd(db.imageName).exec()
-      } catch {
-        case e: NotFoundException =>
-          log.warn(s"Docker image ${db.imageName} not found; pulling image from registry")
-          val callback = new PullImageResultCallback {
-            override def onNext(item: PullResponseItem): Unit = {
-              super.onNext(item)
-              val status = item.getStatus
-              if (status != null && status != "Downloading" && status != "Extracting") {
-                logInfo(s"$status ${item.getId}")
+        val config = DefaultDockerClientConfig.createDefaultConfigBuilder.build
+        val httpClient = new ZerodepDockerHttpClient.Builder()
+          .dockerHost(config.getDockerHost)
+          .sslConfig(config.getSSLConfig)
+          .build()
+        docker = DockerClientImpl.getInstance(config, httpClient)
+        // Check that Docker is actually up
+        try {
+          docker.pingCmd().exec()
+        } catch {
+          case NonFatal(e) =>
+            log.error("Exception while connecting to Docker. Check whether Docker is running.")
+            throw e
+        }
+        try {
+          // Ensure that the Docker image is installed:
+          docker.inspectImageCmd(db.imageName).exec()
+        } catch {
+          case e: NotFoundException =>
+            log.warn(s"Docker image ${db.imageName} not found; pulling image from registry")
+            val callback = new PullImageResultCallback {
+              override def onNext(item: PullResponseItem): Unit = {
+                super.onNext(item)
+                if (item.getStatus != null) {
+                  item.getStatus match {
+                    case s if item.getProgressDetail != null &&
+                      item.getProgressDetail.getCurrent != null &&
+                      item.getProgressDetail.getCurrent == item.getProgressDetail.getTotal =>
+                      // logging for final progress procedural status
+                      logInfo(s"$s ${item.getId} ${bytesToString(item.getProgressDetail.getTotal)}")
+                    case s if s != "Downloading" && s != "Extracting" =>
+                      logInfo(s"${item.getStatus} ${item.getId}")
+                    case _ =>
+                  }
+                }
               }
-            }
-          }
 
-          val (success, time) = Utils.timeTakenMs(
+              override def onComplete(): Unit = {
+                pulled = true
+              }
+
+              override def onError(throwable: Throwable): Unit = {
+                logError(s"Failed to pull Docker image ${db.imageName}", throwable)
+              }
+            }
             docker.pullImageCmd(db.imageName)
               .exec(callback)
-              .awaitCompletion(imagePullTimeout, TimeUnit.SECONDS))
-
-          if (success) {
-            pulled = success
-            logInfo(s"Successfully pulled image ${db.imageName} in $time ms")
-          } else {
-            throw new TimeoutException(
-              s"Timeout('$imagePullTimeout secs') waiting for image ${db.imageName} to be pulled")
-          }
-      }
+              .awaitCompletion(imagePullTimeout, TimeUnit.SECONDS)
+            if (!pulled) {
+              throw new TimeoutException(
+                s"Timeout('$imagePullTimeout secs') waiting for image ${db.imageName} to be pulled")
+            }
+        }
 
-      val hostConfig = HostConfig
-        .newHostConfig()
-        .withNetworkMode("bridge")
-        .withPrivileged(db.privileged)
-        .withPortBindings(PortBinding.parse(s"$externalPort:${db.jdbcPort}"))
+        val hostConfig = HostConfig
+          .newHostConfig()
+          .withNetworkMode("bridge")
+          .withPrivileged(db.privileged)
+          .withPortBindings(PortBinding.parse(s"$externalPort:${db.jdbcPort}"))
 
-      if (db.usesIpc) {
-        hostConfig.withIpcMode("host")
-      }
+        if (db.usesIpc) {
+          hostConfig.withIpcMode("host")
+        }
 
-      val containerConfig = new ContainerConfig()
+        val containerConfig = new ContainerConfig()
 
-      db.beforeContainerStart(hostConfig, containerConfig)
+        db.beforeContainerStart(hostConfig, containerConfig)
 
-      // Create the database container:
-      val createContainerCmd = docker.createContainerCmd(db.imageName)
-        .withHostConfig(hostConfig)
-        .withExposedPorts(ExposedPort.tcp(db.jdbcPort))
-        .withEnv(db.env.map { case (k, v) => s"$k=$v" }.toList.asJava)
-        .withNetworkDisabled(false)
+        // Create the database container:
+        val createContainerCmd = docker.createContainerCmd(db.imageName)
+          .withHostConfig(hostConfig)
+          .withExposedPorts(ExposedPort.tcp(db.jdbcPort))
+          .withEnv(db.env.map { case (k, v) => s"$k=$v" }.toList.asJava)
+          .withNetworkDisabled(false)
 
 
-      db.getEntryPoint.foreach(ep => createContainerCmd.withEntrypoint(ep))
-      db.getStartupProcessName.foreach(n => createContainerCmd.withCmd(n))
+        db.getEntryPoint.foreach(ep => createContainerCmd.withEntrypoint(ep))
+        db.getStartupProcessName.foreach(n => createContainerCmd.withCmd(n))
 
-      container = createContainerCmd.exec()
-      // Start the container and wait until the database can accept JDBC connections:
-      docker.startContainerCmd(container.getId).exec()
-      eventually(timeout(startContainerTimeout.seconds), interval(1.second)) {
-        val response = docker.inspectContainerCmd(container.getId).exec()
-        assert(response.getState.getRunning)
-      }
-      jdbcUrl = db.getJdbcUrl(dockerIp, externalPort)
-      var conn: Connection = null
-      eventually(connectionTimeout, interval(1.second)) {
-        conn = getConnection()
-      }
-      // Run any setup queries:
-      try {
-        dataPreparation(conn)
-      } finally {
-        conn.close()
-      }
-    } catch {
-      case NonFatal(e) =>
-        logError(s"Failed to initialize Docker container for ${this.getClass.getName}", e)
+        container = createContainerCmd.exec()
+        // Start the container and wait until the database can accept JDBC connections:
+        docker.startContainerCmd(container.getId).exec()

Review Comment:
   Due to limited GHA resources, it is not advisable to repeatedly start containers.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Re: [PR] [SPARK-47379][TESTS] Improve docker JDBC suite test reliability [spark]

Posted by "yaooqinn (via GitHub)" <gi...@apache.org>.
yaooqinn commented on PR #45518:
URL: https://github.com/apache/spark/pull/45518#issuecomment-1997433253

   If this is mainly about OracleIntegrationSuite failures, I suggest we use SYS instead of SYSTEM as the connection user.
   
   Based on my observation, an Oracle db container fails to be connected for a consistent reason of `ALTER USER SYSTEM` faluire https://github.com/gvenzl/oci-oracle-free/issues/35#issuecomment-1978043928


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org