You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kyuubi.apache.org by ch...@apache.org on 2022/11/16 11:30:03 UTC

[incubator-kyuubi] branch master updated: [KYUUBI #3807][Subtask][PySpark] Support get Python path from config and prefer PYSPARK_DRIVER_PYTHON env

This is an automated email from the ASF dual-hosted git repository.

chengpan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-kyuubi.git


The following commit(s) were added to refs/heads/master by this push:
     new bfd7f7b6b [KYUUBI #3807][Subtask][PySpark] Support get Python path from config and prefer PYSPARK_DRIVER_PYTHON env
bfd7f7b6b is described below

commit bfd7f7b6b18a896b4479b76b5286c89463ccf5bb
Author: liangbowen <li...@gf.com.cn>
AuthorDate: Wed Nov 16 19:29:53 2022 +0800

    [KYUUBI #3807][Subtask][PySpark] Support get Python path from config and prefer PYSPARK_DRIVER_PYTHON env
    
    ### _Why are the changes needed?_
    
    to close #3807
    
    1. Prefer the system env PYSPARK_DRIVER_PYTHON over PYSPARK_PYTHON for the Python executable path, to fix the problem that occurs when submitting to YARN in client deploy mode.
    2. Support getting the Python executable path from the Spark config.
    
    The lookup follows the same order as Spark (<https://github.com/apache/spark/blob/v3.3.1/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java#L330>):
    
    ```
    // 1. conf spark.pyspark.driver.python
    // 2. conf spark.pyspark.python
    // 3. environment variable PYSPARK_DRIVER_PYTHON
    // 4. environment variable PYSPARK_PYTHON
    // 5. python3 (default fallback)
    ```
    
    ### _How was this patch tested?_
    - [ ] Add some test cases that check the changes thoroughly including negative and positive cases if possible
    
    - [ ] Add screenshots for manual tests if appropriate
    
    - [x] [Run tests](https://kyuubi.apache.org/docs/latest/develop_tools/testing.html#running-tests) locally before making a pull request
    
    Closes #3808 from bowenliang123/3807-pysparkpython.
    
    Closes #3807
    
    567a81d6 [liangbowen] fix NoSuchElementException
    8a22a15a [liangbowen] nit
    adc25463 [liangbowen] Support get python path from Spark config. Prefer the system env PYSPARK_DRIVER_PYTHON than PYSPARK_PYTHON for Python execute path.
    
    Authored-by: liangbowen <li...@gf.com.cn>
    Signed-off-by: Cheng Pan <ch...@apache.org>
---
 .../kyuubi/engine/spark/operation/ExecutePython.scala     | 15 ++++++++++-----
 .../engine/spark/operation/SparkSQLOperationManager.scala |  2 +-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/operation/ExecutePython.scala b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/operation/ExecutePython.scala
index f980fda9a..cb3d8f842 100644
--- a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/operation/ExecutePython.scala
+++ b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/operation/ExecutePython.scala
@@ -26,8 +26,9 @@ import scala.collection.JavaConverters._
 
 import com.fasterxml.jackson.databind.ObjectMapper
 import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import org.apache.commons.lang3.StringUtils
 import org.apache.spark.api.python.KyuubiPythonGatewayServer
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.{Row, RuntimeConfig}
 import org.apache.spark.sql.types.StructType
 
 import org.apache.kyuubi.Logging
@@ -98,9 +99,6 @@ case class SessionPythonWorker(
 
 object ExecutePython extends Logging {
 
-  // TODO:(fchen) get from conf
-  val pythonExec =
-    sys.env.getOrElse("PYSPARK_PYTHON", sys.env.getOrElse("PYSPARK_DRIVER_PYTHON", "python3"))
   private val isPythonGatewayStart = new AtomicBoolean(false)
   val kyuubiPythonPath = Files.createTempDirectory("")
   def init(): Unit = {
@@ -116,7 +114,14 @@ object ExecutePython extends Logging {
     }
   }
 
-  def createSessionPythonWorker(): SessionPythonWorker = {
+  def createSessionPythonWorker(conf: RuntimeConfig): SessionPythonWorker = {
+    val pythonExec = StringUtils.firstNonBlank(
+      conf.getOption("spark.pyspark.driver.python").orNull,
+      conf.getOption("spark.pyspark.python").orNull,
+      System.getenv("PYSPARK_DRIVER_PYTHON"),
+      System.getenv("PYSPARK_PYTHON"),
+      "python3")
+
     val builder = new ProcessBuilder(Seq(
       pythonExec,
       s"${ExecutePython.kyuubiPythonPath}/execute_python.py").asJava)
diff --git a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/operation/SparkSQLOperationManager.scala b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/operation/SparkSQLOperationManager.scala
index 4166d4902..1baa44d5b 100644
--- a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/operation/SparkSQLOperationManager.scala
+++ b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/operation/SparkSQLOperationManager.scala
@@ -93,7 +93,7 @@ class SparkSQLOperationManager private (name: String) extends OperationManager(n
           ExecutePython.init()
           val worker = sessionToPythonProcess.getOrElseUpdate(
             session.handle,
-            ExecutePython.createSessionPythonWorker())
+            ExecutePython.createSessionPythonWorker(spark.conf))
           new ExecutePython(session, statement, worker)
         case OperationLanguages.UNKNOWN =>
           spark.conf.unset(OPERATION_LANGUAGE.key)