Posted to commits@spark.apache.org by hv...@apache.org on 2023/03/07 12:34:59 UTC

[spark] branch master updated: [SPARK-42656][CONNECT][FOLLOWUP] Spark Connect Shell

This is an automated email from the ASF dual-hosted git repository.

hvanhovell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 2e7207f96e1 [SPARK-42656][CONNECT][FOLLOWUP] Spark Connect Shell
2e7207f96e1 is described below

commit 2e7207f96e1ff848def135de63f63bcda7402517
Author: Zhen Li <zh...@users.noreply.github.com>
AuthorDate: Tue Mar 7 08:34:40 2023 -0400

    [SPARK-42656][CONNECT][FOLLOWUP] Spark Connect Shell
    
    ### What changes were proposed in this pull request?
    Added a spark-connect-shell script that starts the Spark shell with Spark Connect enabled.
    Added a "-Pconnect" profile to build Spark Connect into the distributions.
    Simplified the dev shell scripts using the "-Pconnect" profile.
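    
    For example, the intended flow, pieced together from the scripts and docs in this change:
    
        build/sbt -Phive -Pconnect package    # or: ./build/mvn -Pconnect -DskipTests clean package
        bin/spark-connect-shell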
    
    ### Why are the changes needed?
    Allows users to try out Spark Connect easily.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes. Added a new shell script and a "-Pconnect" build option.
    
    ### How was this patch tested?
    Manually tested.
    
    Closes #40305 from zhenlineo/connect-shell.
    
    Authored-by: Zhen Li <zh...@users.noreply.github.com>
    Signed-off-by: Herman van Hovell <he...@databricks.com>
---
 assembly/pom.xml                                       | 10 ++++++++++
 .../bin/spark-connect => bin/spark-connect-shell       | 18 ++++++------------
 connector/connect/bin/spark-connect                    | 13 ++++++-------
 connector/connect/bin/spark-connect-scala-client.sc    |  1 +
 .../connect/bin/{spark-connect => spark-connect-shell} | 15 +++++++--------
 docs/building-spark.md                                 |  4 ++++
 .../main/scala-2.12/org/apache/spark/repl/Main.scala   |  5 +++++
 7 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/assembly/pom.xml b/assembly/pom.xml
index aee572b64a8..36cc6078438 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -152,6 +152,16 @@
         </dependency>
       </dependencies>
     </profile>
+    <profile>
+      <id>connect</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.spark</groupId>
+          <artifactId>spark-connect_${scala.binary.version}</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+      </dependencies>
+    </profile>
     <profile>
       <id>kubernetes</id>
       <dependencies>
diff --git a/connector/connect/bin/spark-connect b/bin/spark-connect-shell
similarity index 53%
copy from connector/connect/bin/spark-connect
copy to bin/spark-connect-shell
index 2f2ce7df08c..9026c81e70d 100755
--- a/connector/connect/bin/spark-connect
+++ b/bin/spark-connect-shell
@@ -17,17 +17,11 @@
 # limitations under the License.
 #
 
-# Go to the Spark project root directory
-FWDIR="$(cd "`dirname "$0"`"/../../..; pwd)"
-cd "$FWDIR"
-export SPARK_HOME=$FWDIR
+# The shell script to start spark-shell with Spark Connect enabled.
 
-SCALA_BINARY_VER=`grep "scala.binary.version" "${SPARK_HOME}/pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'`
-SCALA_ARG=$(if [ "${SCALA_BINARY_VER}" == "2.13" ]; then echo "-Pscala-2.13"; else echo ""; fi)
+if [ -z "${SPARK_HOME}" ]; then
+  source "$(dirname "$0")"/find-spark-home
+fi
 
-# Build the jars needed for spark submit and spark connect
-build/sbt "${SCALA_ARG}" -Phive package
-
-CONNECT_JAR=`ls "${SPARK_HOME}"/connector/connect/server/target/scala-"${SCALA_BINARY_VER}"/spark-connect-assembly*.jar | paste -sd ',' -`
-
-exec "${SPARK_HOME}"/bin/spark-submit "$@" --class org.apache.spark.sql.connect.SimpleSparkConnectService "$CONNECT_JAR"
\ No newline at end of file
+# This requires building Spark with `-Pconnect`, e.g. `build/sbt -Pconnect package`
+exec "${SPARK_HOME}"/bin/spark-shell --conf spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin "$@"
\ No newline at end of file
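
A quick check of the new script against a prebuilt distribution (a sketch; the port setting is not part of this patch and assumes the Connect server default of 15002):

    # Launch spark-shell with the Spark Connect plugin; a gRPC server starts in-process
    ./bin/spark-connect-shell
    # Optionally pin the server port
    ./bin/spark-connect-shell --conf spark.connect.grpc.binding.port=15002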
diff --git a/connector/connect/bin/spark-connect b/connector/connect/bin/spark-connect
index 2f2ce7df08c..62d0d36b441 100755
--- a/connector/connect/bin/spark-connect
+++ b/connector/connect/bin/spark-connect
@@ -17,17 +17,16 @@
 # limitations under the License.
 #
 
+# Start the Spark Connect server with its logs printed to standard output. The script rebuilds
+# the server dependencies and starts the server at the default port. This can be used to debug
+# the client during client development.
+
 # Go to the Spark project root directory
 FWDIR="$(cd "`dirname "$0"`"/../../..; pwd)"
 cd "$FWDIR"
 export SPARK_HOME=$FWDIR
 
-SCALA_BINARY_VER=`grep "scala.binary.version" "${SPARK_HOME}/pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'`
-SCALA_ARG=$(if [ "${SCALA_BINARY_VER}" == "2.13" ]; then echo "-Pscala-2.13"; else echo ""; fi)
-
 # Build the jars needed for spark submit and spark connect
-build/sbt "${SCALA_ARG}" -Phive package
-
-CONNECT_JAR=`ls "${SPARK_HOME}"/connector/connect/server/target/scala-"${SCALA_BINARY_VER}"/spark-connect-assembly*.jar | paste -sd ',' -`
+build/sbt -Phive -Pconnect package
 
-exec "${SPARK_HOME}"/bin/spark-submit "$@" --class org.apache.spark.sql.connect.SimpleSparkConnectService "$CONNECT_JAR"
\ No newline at end of file
+exec "${SPARK_HOME}"/bin/spark-submit --class org.apache.spark.sql.connect.SimpleSparkConnectService "$@"
\ No newline at end of file
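
A typical dev loop with this script (the client launcher name is an assumption, inferred from the spark-connect-scala-client.sc file patched below):

    # Terminal 1: rebuild the server jars and run the service with logs in the foreground
    ./connector/connect/bin/spark-connect
    # Terminal 2: attach a client to the default endpoint
    ./connector/connect/bin/spark-connect-scala-client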
diff --git a/connector/connect/bin/spark-connect-scala-client.sc b/connector/connect/bin/spark-connect-scala-client.sc
index a8d1856498c..9cb4f92417d 100644
--- a/connector/connect/bin/spark-connect-scala-client.sc
+++ b/connector/connect/bin/spark-connect-scala-client.sc
@@ -22,6 +22,7 @@ val sessionBuilder = SparkSession.builder()
 val spark = if (conStr.isEmpty) sessionBuilder.build() else sessionBuilder.remote(conStr).build()
 import spark.implicits._
 import spark.sql
+println("Spark session available as 'spark'.")
 println(
   """
     |   _____                  __      ______                            __
diff --git a/connector/connect/bin/spark-connect b/connector/connect/bin/spark-connect-shell
similarity index 62%
copy from connector/connect/bin/spark-connect
copy to connector/connect/bin/spark-connect-shell
index 2f2ce7df08c..b31ba1bf140 100755
--- a/connector/connect/bin/spark-connect
+++ b/connector/connect/bin/spark-connect-shell
@@ -17,17 +17,16 @@
 # limitations under the License.
 #
 
+# The Spark Connect shell for development. This shell script builds the Spark Connect server with
+# all dependencies and starts the server at the default port.
+# Use `/bin/spark-connect-shell` instead if rebuilding the dependency jars is not needed.
+
 # Go to the Spark project root directory
 FWDIR="$(cd "`dirname "$0"`"/../../..; pwd)"
 cd "$FWDIR"
 export SPARK_HOME=$FWDIR
 
-SCALA_BINARY_VER=`grep "scala.binary.version" "${SPARK_HOME}/pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'`
-SCALA_ARG=$(if [ "${SCALA_BINARY_VER}" == "2.13" ]; then echo "-Pscala-2.13"; else echo ""; fi)
-
-# Build the jars needed for spark submit and spark connect
-build/sbt "${SCALA_ARG}" -Phive package
-
-CONNECT_JAR=`ls "${SPARK_HOME}"/connector/connect/server/target/scala-"${SCALA_BINARY_VER}"/spark-connect-assembly*.jar | paste -sd ',' -`
+# Build the jars needed for spark shell and spark connect
+build/sbt -Phive -Pconnect package
 
-exec "${SPARK_HOME}"/bin/spark-submit "$@" --class org.apache.spark.sql.connect.SimpleSparkConnectService "$CONNECT_JAR"
\ No newline at end of file
+exec "${SPARK_HOME}"/bin/spark-shell --conf spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin "$@"
\ No newline at end of file
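
Summarizing the two shell entry points in this patch, they differ only in whether they rebuild first:

    # Dev tree: runs `build/sbt -Phive -Pconnect package`, then launches the shell
    ./connector/connect/bin/spark-connect-shell
    # Prebuilt distribution: launches the shell directly, no rebuild
    ./bin/spark-connect-shell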
diff --git a/docs/building-spark.md b/docs/building-spark.md
index 49359fedee9..8487c482615 100644
--- a/docs/building-spark.md
+++ b/docs/building-spark.md
@@ -119,6 +119,10 @@ For instance, you can build the Spark Streaming module using:
 
 where `spark-streaming_{{site.SCALA_BINARY_VERSION}}` is the `artifactId` as defined in `streaming/pom.xml` file.
 
+## Building with Spark Connect support
+
+    ./build/mvn -Pconnect -DskipTests clean package
+
 ## Continuous Compilation
 
 We use the scala-maven-plugin which supports incremental and continuous compilation. E.g.
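
For sbt builds, the dev scripts in this patch use the equivalent:

    ./build/sbt -Phive -Pconnect package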
diff --git a/repl/src/main/scala-2.12/org/apache/spark/repl/Main.scala b/repl/src/main/scala-2.12/org/apache/spark/repl/Main.scala
index a68b112ed2b..eaca4ad6ee2 100644
--- a/repl/src/main/scala-2.12/org/apache/spark/repl/Main.scala
+++ b/repl/src/main/scala-2.12/org/apache/spark/repl/Main.scala
@@ -121,6 +121,11 @@ object Main extends Logging {
       sparkContext = sparkSession.sparkContext
       sparkSession
     } catch {
+      case e: ClassNotFoundException if isShellSession && e.getMessage.contains(
+        "org.apache.spark.sql.connect.SparkConnectPlugin") =>
+        logError("Failed to load spark connect plugin.")
+        logError("You need to build Spark with -Pconnect.")
+        sys.exit(1)
       case e: Exception if isShellSession =>
         logError("Failed to initialize Spark session.", e)
         sys.exit(1)
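
For reference, the failure mode this hunk guards against (a sketch; exact log prefixes depend on the log4j configuration):

    # In a build without -Pconnect, the plugin class is missing, so the shell
    # logs the two errors added above and exits:
    #   Failed to load spark connect plugin.
    #   You need to build Spark with -Pconnect.
    ./bin/spark-connect-shell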


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org