Posted to commits@spark.apache.org by pw...@apache.org on 2014/05/09 07:26:45 UTC

git commit: SPARK-1565 (Addendum): Replace `run-example` with `spark-submit`.

Repository: spark
Updated Branches:
  refs/heads/master 3f779d872 -> 06b15baab


SPARK-1565 (Addendum): Replace `run-example` with `spark-submit`.

Gives a nicely formatted message to the user when `run-example` is run to
tell them to use `spark-submit`.
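
With this change, invoking `./bin/run-example` with no arguments prints a
usage message along these lines (taken from the new script further down):

    usage: ./bin/run-example <example-class> [example-args]
      - set MASTER=XX to use a specific master
      - can use abbreviated example class name (e.g. SparkPi, mllib.MovieLensALS)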

Author: Patrick Wendell <pw...@gmail.com>

Closes #704 from pwendell/examples and squashes the following commits:

1996ee8 [Patrick Wendell] Feedback from Andrew
3eb7803 [Patrick Wendell] Suggestions from TD
2474668 [Patrick Wendell] SPARK-1565 (Addendum): Replace `run-example` with `spark-submit`.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/06b15baa
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/06b15baa
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/06b15baa

Branch: refs/heads/master
Commit: 06b15baab25951d124bbe6b64906f4139e037deb
Parents: 3f779d8
Author: Patrick Wendell <pw...@gmail.com>
Authored: Thu May 8 22:26:17 2014 -0700
Committer: Patrick Wendell <pw...@gmail.com>
Committed: Thu May 8 22:26:36 2014 -0700

----------------------------------------------------------------------
 README.md                                       | 19 ++++--
 bin/pyspark                                     |  2 +-
 bin/run-example                                 | 71 +++++---------------
 bin/spark-class                                 |  2 +-
 .../main/scala/org/apache/spark/rdd/RDD.scala   |  4 +-
 docs/running-on-yarn.md                         |  2 +-
 make-distribution.sh                            |  2 +
 7 files changed, 37 insertions(+), 65 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/06b15baa/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index e2d1dcb..9c2e32b 100644
--- a/README.md
+++ b/README.md
@@ -39,17 +39,22 @@ And run the following command, which should also return 1000:
 ## Example Programs
 
 Spark also comes with several sample programs in the `examples` directory.
-To run one of them, use `./bin/run-example <class> <params>`. For example:
+To run one of them, use `./bin/run-example <class> [params]`. For example:
 
-    ./bin/run-example org.apache.spark.examples.SparkLR local[2]
+    ./bin/run-example org.apache.spark.examples.SparkLR
 
-will run the Logistic Regression example locally on 2 CPUs.
+will run the Logistic Regression example locally.
 
-Each of the example programs prints usage help if no params are given.
+You can set the MASTER environment variable when running examples to submit
+examples to a cluster. This can be a mesos:// or spark:// URL, 
+"yarn-cluster" or "yarn-client" to run on YARN, and "local" to run 
+locally with one thread, or "local[N]" to run locally with N threads. You 
+can also use an abbreviated class name if the class is in the `examples`
+package. For instance:
 
-All of the Spark samples take a `<master>` parameter that is the cluster URL
-to connect to. This can be a mesos:// or spark:// URL, or "local" to run
-locally with one thread, or "local[N]" to run locally with N threads.
+    MASTER=spark://host:7077 ./bin/run-example SparkPi
+
+Many of the example programs print usage help if no params are given.
 
 ## Running Tests
 

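As a quick illustration of the new README instructions, here are a couple of
sample invocations under the updated script; the argument "100" (the number
of slices for SparkPi) is only illustrative:

    # Run SparkPi locally with 4 threads, using the abbreviated class name.
    MASTER="local[4]" ./bin/run-example SparkPi 100

    # Submit the same example to a YARN cluster in client mode.
    MASTER=yarn-client ./bin/run-example SparkPi 100
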
http://git-wip-us.apache.org/repos/asf/spark/blob/06b15baa/bin/pyspark
----------------------------------------------------------------------
diff --git a/bin/pyspark b/bin/pyspark
index f555885..10e35e0 100755
--- a/bin/pyspark
+++ b/bin/pyspark
@@ -31,7 +31,7 @@ if [ ! -f "$FWDIR/RELEASE" ]; then
   ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*.jar >& /dev/null
   if [[ $? != 0 ]]; then
     echo "Failed to find Spark assembly in $FWDIR/assembly/target" >&2
-    echo "You need to build Spark with sbt/sbt assembly before running this program" >&2
+    echo "You need to build Spark before running this program" >&2
     exit 1
   fi
 fi

http://git-wip-us.apache.org/repos/asf/spark/blob/06b15baa/bin/run-example
----------------------------------------------------------------------
diff --git a/bin/run-example b/bin/run-example
index d8a94f2..146951a 100755
--- a/bin/run-example
+++ b/bin/run-example
@@ -17,28 +17,10 @@
 # limitations under the License.
 #
 
-cygwin=false
-case "`uname`" in
-    CYGWIN*) cygwin=true;;
-esac
-
 SCALA_VERSION=2.10
 
-# Figure out where the Scala framework is installed
 FWDIR="$(cd `dirname $0`/..; pwd)"
-
-# Export this as SPARK_HOME
 export SPARK_HOME="$FWDIR"
-
-. $FWDIR/bin/load-spark-env.sh
-
-if [ -z "$1" ]; then
-  echo "Usage: run-example <example-class> [<args>]" >&2
-  exit 1
-fi
-
-# Figure out the JAR file that our examples were packaged into. This includes a bit of a hack
-# to avoid the -sources and -doc packages that are built by publish-local.
 EXAMPLES_DIR="$FWDIR"/examples
 
 if [ -f "$FWDIR/RELEASE" ]; then
@@ -49,46 +31,29 @@ fi
 
 if [[ -z $SPARK_EXAMPLES_JAR ]]; then
   echo "Failed to find Spark examples assembly in $FWDIR/lib or $FWDIR/examples/target" >&2
-  echo "You need to build Spark with sbt/sbt assembly before running this program" >&2
+  echo "You need to build Spark before running this program" >&2
   exit 1
 fi
 
+EXAMPLE_MASTER=${MASTER:-"local[*]"}
 
-# Since the examples JAR ideally shouldn't include spark-core (that dependency should be
-# "provided"), also add our standard Spark classpath, built using compute-classpath.sh.
-CLASSPATH=`$FWDIR/bin/compute-classpath.sh`
-CLASSPATH="$SPARK_EXAMPLES_JAR:$CLASSPATH"
-
-if $cygwin; then
-    CLASSPATH=`cygpath -wp $CLASSPATH`
-    export SPARK_EXAMPLES_JAR=`cygpath -w $SPARK_EXAMPLES_JAR`
-fi
-
-# Find java binary
-if [ -n "${JAVA_HOME}" ]; then
-  RUNNER="${JAVA_HOME}/bin/java"
-else
-  if [ `command -v java` ]; then
-    RUNNER="java"
-  else
-    echo "JAVA_HOME is not set" >&2
-    exit 1
-  fi
-fi
-
-# Set JAVA_OPTS to be able to load native libraries and to set heap size
-JAVA_OPTS="$SPARK_JAVA_OPTS"
-# Load extra JAVA_OPTS from conf/java-opts, if it exists
-if [ -e "$FWDIR/conf/java-opts" ] ; then
-  JAVA_OPTS="$JAVA_OPTS `cat $FWDIR/conf/java-opts`"
+if [ -n "$1" ]; then
+  EXAMPLE_CLASS="$1"
+  shift
+else 
+  echo "usage: ./bin/run-example <example-class> [example-args]" 
+  echo "  - set MASTER=XX to use a specific master"
+  echo "  - can use abbreviated example class name (e.g. SparkPi, mllib.MovieLensALS)"
+  echo
+  exit -1
 fi
-export JAVA_OPTS
 
-if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
-  echo -n "Spark Command: "
-  echo "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
-  echo "========================================"
-  echo
+if [[ ! $EXAMPLE_CLASS == org.apache.spark.examples* ]]; then
+  EXAMPLE_CLASS="org.apache.spark.examples.$EXAMPLE_CLASS"
 fi
 
-exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
+./bin/spark-submit \
+  --master $EXAMPLE_MASTER \
+  --class $EXAMPLE_CLASS \
+  $SPARK_EXAMPLES_JAR \
+  "$@"

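For readers comparing old and new behavior: the rewritten script now simply
delegates to `spark-submit`. For instance, `./bin/run-example SparkPi 10`
ends up running roughly the following command (the assembly-jar path is
illustrative and depends on how Spark was built):

    ./bin/spark-submit \
      --master "local[*]" \
      --class org.apache.spark.examples.SparkPi \
      examples/target/scala-2.10/spark-examples-assembly-*.jar \
      10
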
http://git-wip-us.apache.org/repos/asf/spark/blob/06b15baa/bin/spark-class
----------------------------------------------------------------------
diff --git a/bin/spark-class b/bin/spark-class
index 72f8b9b..6480ccb 100755
--- a/bin/spark-class
+++ b/bin/spark-class
@@ -114,7 +114,7 @@ if [ ! -f "$FWDIR/RELEASE" ]; then
   jars_list=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar")
   if [ "$num_jars" -eq "0" ]; then
     echo "Failed to find Spark assembly in $FWDIR/assembly/target/scala-$SCALA_VERSION/" >&2
-    echo "You need to build Spark with 'sbt/sbt assembly' before running this program." >&2
+    echo "You need to build Spark before running this program." >&2
     exit 1
   fi
   if [ "$num_jars" -gt "1" ]; then

http://git-wip-us.apache.org/repos/asf/spark/blob/06b15baa/core/src/main/scala/org/apache/spark/rdd/RDD.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index a1ca612..9d8d804 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -330,9 +330,9 @@ abstract class RDD[T: ClassTag](
     if (shuffle) {
       // include a shuffle step so that our upstream tasks are still distributed
       new CoalescedRDD(
-        new ShuffledRDD[T, Null, (T, Null)](map(x => (x, null)),
+        new ShuffledRDD[Int, T, (Int, T)](map(x => (Utils.random.nextInt(), x)),
         new HashPartitioner(numPartitions)),
-        numPartitions).keys
+        numPartitions).values
     } else {
       new CoalescedRDD(this, numPartitions)
     }

http://git-wip-us.apache.org/repos/asf/spark/blob/06b15baa/docs/running-on-yarn.md
----------------------------------------------------------------------
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index 68183ee..c563594 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -53,7 +53,7 @@ For example:
         --driver-memory 4g \
         --executor-memory 2g \
         --executor-cores 1
-        examples/target/scala-{{site.SCALA_BINARY_VERSION}}/spark-examples-assembly-{{site.SPARK_VERSION}}.jar \
+        lib/spark-examples*.jar \
         yarn-cluster 5
 
 The above starts a YARN client program which starts the default Application Master. Then SparkPi will be run as a child thread of Application Master. The client will periodically poll the Application Master for status updates and display them in the console. The client will exit once your application has finished running.  Refer to the "Viewing Logs" section below for how to see driver and executor logs.
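
The hunk above shows only the tail of the `spark-submit` invocation. For
context, a sketch of the full command it belongs to; the `--class` and
`--master` lines are not part of this hunk and are reproduced here only as an
assumption about the surrounding doc text:

    ./bin/spark-submit --class org.apache.spark.examples.SparkPi \
        --master yarn-cluster \
        --driver-memory 4g \
        --executor-memory 2g \
        --executor-cores 1 \
        lib/spark-examples*.jar \
        yarn-cluster 5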

http://git-wip-us.apache.org/repos/asf/spark/blob/06b15baa/make-distribution.sh
----------------------------------------------------------------------
diff --git a/make-distribution.sh b/make-distribution.sh
index 759e555..1cc2844 100755
--- a/make-distribution.sh
+++ b/make-distribution.sh
@@ -40,6 +40,8 @@
 #
 
 set -o pipefail
+set -e
+
 # Figure out where the Spark framework is installed
 FWDIR="$(cd `dirname $0`; pwd)"
 DISTDIR="$FWDIR/dist"
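
A side note on the `set -e` added above: combined with the existing
`set -o pipefail`, it makes the script abort as soon as any command (or any
stage of a pipeline) fails. A minimal bash sketch, separate from the patch,
showing the effect:

    #!/usr/bin/env bash
    set -o pipefail   # a pipeline's exit status is that of the last failing command
    set -e            # exit immediately when a command returns a non-zero status

    false | cat       # without pipefail this pipeline would report success;
                      # with it, the pipeline fails, and set -e stops the script here
    echo "never printed"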