You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2015/11/05 04:28:10 UTC
mahout git commit: MAHOUT-1778: Mahout Spark Shell doesn't work with
Spark > 1.3, this closes apache/mahout#164
Repository: mahout
Updated Branches:
refs/heads/master 82e78a8c9 -> e0b8b90e9
MAHOUT-1778: Mahout Spark Shell doesn't work with Spark > 1.3, this closes apache/mahout#164
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/e0b8b90e
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/e0b8b90e
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/e0b8b90e
Branch: refs/heads/master
Commit: e0b8b90e91c2ea72e2562d504ad16051cf759787
Parents: 82e78a8
Author: smarthi <sm...@apache.org>
Authored: Wed Nov 4 22:27:36 2015 -0500
Committer: smarthi <sm...@apache.org>
Committed: Wed Nov 4 22:27:36 2015 -0500
----------------------------------------------------------------------
bin/compute-classpath.sh | 186 +++++++++++++++++++
bin/mahout | 10 +-
bin/mahout-load-spark-env.sh | 40 ++++
bin/mahout-spark-class.sh | 80 ++++++++
pom.xml | 4 +-
.../sparkbindings/shell/MahoutSparkILoop.scala | 2 +-
6 files changed, 318 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/bin/compute-classpath.sh
----------------------------------------------------------------------
diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
new file mode 100755
index 0000000..79898e4
--- /dev/null
+++ b/bin/compute-classpath.sh
@@ -0,0 +1,186 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script computes Spark's classpath and prints it to stdout; it's used by both the "run"
+# script and the ExecutorRunner in standalone cluster mode.
+
+# Figure out where Spark is installed
+#FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
+FWDIR="$SPARK_HOME"
+
+#. "$FWDIR"/bin/load-spark-env.sh # not executable by default in $SPARK_HOME/bin
+
+"$MAHOUT_HOME"/bin/mahout-load-spark-env.sh
+
+# Compute the Scala version. Note: though Mahout has not been tested with Scala 2.11,
+# setting SPARK_SCALA_VERSION if not already set.
+
+if [ -z "$SPARK_SCALA_VERSION" ]; then
+
+ ASSEMBLY_DIR2="$FWDIR/assembly/target/scala-2.11"
+ ASSEMBLY_DIR1="$FWDIR/assembly/target/scala-2.10"
+
+ if [[ -d "$ASSEMBLY_DIR2" && -d "$ASSEMBLY_DIR1" ]]; then
+ echo -e "Presence of build for both scala versions(SCALA 2.10 and SCALA 2.11) detected." 1>&2
+ echo -e 'Either clean one of them or, export SPARK_SCALA_VERSION=2.11 in spark-env.sh.' 1>&2
+ exit 1
+ fi
+
+ if [ -d "$ASSEMBLY_DIR2" ]; then
+ export SPARK_SCALA_VERSION="2.11"
+ else
+ export SPARK_SCALA_VERSION="2.10"
+ fi
+fi
+
+
+function appendToClasspath(){
+ if [ -n "$1" ]; then
+ if [ -n "$CLASSPATH" ]; then
+ CLASSPATH="$CLASSPATH:$1"
+ else
+ CLASSPATH="$1"
+ fi
+ fi
+}
+
+appendToClasspath "$SPARK_CLASSPATH"
+appendToClasspath "$SPARK_SUBMIT_CLASSPATH"
+
+# Build up classpath
+if [ -n "$SPARK_CONF_DIR" ]; then
+ appendToClasspath "$SPARK_CONF_DIR"
+else
+ appendToClasspath "$FWDIR/conf"
+fi
+
+ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SPARK_SCALA_VERSION"
+
+if [ -n "$JAVA_HOME" ]; then
+ JAR_CMD="$JAVA_HOME/bin/jar"
+else
+ JAR_CMD="jar"
+fi
+
+# A developer option to prepend more recently compiled Spark classes
+if [ -n "$SPARK_PREPEND_CLASSES" ]; then
+ echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\
+ "classes ahead of assembly." >&2
+ # Spark classes
+ appendToClasspath "$FWDIR/core/target/scala-$SPARK_SCALA_VERSION/classes"
+ appendToClasspath "$FWDIR/repl/target/scala-$SPARK_SCALA_VERSION/classes"
+ appendToClasspath "$FWDIR/mllib/target/scala-$SPARK_SCALA_VERSION/classes"
+ appendToClasspath "$FWDIR/bagel/target/scala-$SPARK_SCALA_VERSION/classes"
+ appendToClasspath "$FWDIR/graphx/target/scala-$SPARK_SCALA_VERSION/classes"
+ appendToClasspath "$FWDIR/streaming/target/scala-$SPARK_SCALA_VERSION/classes"
+ appendToClasspath "$FWDIR/tools/target/scala-$SPARK_SCALA_VERSION/classes"
+ appendToClasspath "$FWDIR/sql/catalyst/target/scala-$SPARK_SCALA_VERSION/classes"
+ appendToClasspath "$FWDIR/sql/core/target/scala-$SPARK_SCALA_VERSION/classes"
+ appendToClasspath "$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/classes"
+ appendToClasspath "$FWDIR/sql/hive-thriftserver/target/scala-$SPARK_SCALA_VERSION/classes"
+ appendToClasspath "$FWDIR/yarn/stable/target/scala-$SPARK_SCALA_VERSION/classes"
+ # Jars for shaded deps in their original form (copied here during build)
+ appendToClasspath "$FWDIR/core/target/jars/*"
+fi
+
+# Use spark-assembly jar from either RELEASE or assembly directory
+if [ -f "$FWDIR/RELEASE" ]; then
+ assembly_folder="$FWDIR"/lib
+else
+ assembly_folder="$ASSEMBLY_DIR"
+fi
+
+num_jars=0
+
+for f in "${assembly_folder}"/spark-assembly*hadoop*.jar; do
+ if [[ ! -e "$f" ]]; then
+ echo "Failed to find Spark assembly in $assembly_folder" 1>&2
+ echo "You need to build Spark before running this program." 1>&2
+ exit 1
+ fi
+ ASSEMBLY_JAR="$f"
+ num_jars=$((num_jars+1))
+done
+
+if [ "$num_jars" -gt "1" ]; then
+ echo "Found multiple Spark assembly jars in $assembly_folder:" 1>&2
+ ls "${assembly_folder}"/spark-assembly*hadoop*.jar 1>&2
+ echo "Please remove all but one jar." 1>&2
+ exit 1
+fi
+
+# Only able to make this check if 'jar' command is available
+if [ $(command -v "$JAR_CMD") ] ; then
+ # Verify that versions of java used to build the jars and run Spark are compatible
+ jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
+ if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
+ echo "Loading Spark jar with '$JAR_CMD' failed. " 1>&2
+ echo "This is likely because Spark was compiled with Java 7 and run " 1>&2
+ echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark " 1>&2
+ echo "or build Spark with Java 6." 1>&2
+ exit 1
+ fi
+fi
+
+appendToClasspath "$ASSEMBLY_JAR"
+
+# When Hive support is needed, Datanucleus jars must be included on the classpath.
+# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
+# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
+# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
+# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
+# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
+if [ -f "$FWDIR/RELEASE" ]; then
+ datanucleus_dir="$FWDIR"/lib
+else
+ datanucleus_dir="$FWDIR"/lib_managed/jars
+fi
+
+datanucleus_jars="$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar$")"
+datanucleus_jars="$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)"
+
+if [ -n "$datanucleus_jars" ]; then
+ appendToClasspath "$datanucleus_jars"
+fi
+
+# Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1
+if [[ $SPARK_TESTING == 1 ]]; then
+ appendToClasspath "$FWDIR/core/target/scala-$SPARK_SCALA_VERSION/test-classes"
+ appendToClasspath "$FWDIR/repl/target/scala-$SPARK_SCALA_VERSION/test-classes"
+ appendToClasspath "$FWDIR/mllib/target/scala-$SPARK_SCALA_VERSION/test-classes"
+ appendToClasspath "$FWDIR/bagel/target/scala-$SPARK_SCALA_VERSION/test-classes"
+ appendToClasspath "$FWDIR/graphx/target/scala-$SPARK_SCALA_VERSION/test-classes"
+ appendToClasspath "$FWDIR/streaming/target/scala-$SPARK_SCALA_VERSION/test-classes"
+ appendToClasspath "$FWDIR/sql/catalyst/target/scala-$SPARK_SCALA_VERSION/test-classes"
+ appendToClasspath "$FWDIR/sql/core/target/scala-$SPARK_SCALA_VERSION/test-classes"
+ appendToClasspath "$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/test-classes"
+fi
+
+# Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail !
+# Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts
+# the configuration files.
+appendToClasspath "$HADOOP_CONF_DIR"
+appendToClasspath "$YARN_CONF_DIR"
+
+# To allow for distributions to append needed libraries to the classpath (e.g. when
+# using the "hadoop-provided" profile to build Spark), check SPARK_DIST_CLASSPATH and
+# append it to the final classpath.
+appendToClasspath "$SPARK_DIST_CLASSPATH"
+
+echo "$CLASSPATH"
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/bin/mahout
----------------------------------------------------------------------
diff --git a/bin/mahout b/bin/mahout
index 24f01ba..b16d51b 100755
--- a/bin/mahout
+++ b/bin/mahout
@@ -211,7 +211,7 @@ then
CLASSPATH=${CLASSPATH}:$f;
done
- SPARK_CP_BIN="${SPARK_HOME}/bin/compute-classpath.sh"
+ SPARK_CP_BIN="${MAHOUT_HOME}/bin/compute-classpath.sh"
if [ -x "${SPARK_CP_BIN}" ]; then
SPARK_CLASSPATH=$("${SPARK_CP_BIN}" 2>/dev/null)
CLASSPATH="${CLASSPATH}:${SPARK_CLASSPATH}"
@@ -220,6 +220,14 @@ then
exit -1
fi
+ SPARK_ASSEBMLY_BIN="${MAHOUT_HOME}/bin/mahout-spark-class.sh"
+ if [ -x "${SPARK_ASSEBMLY_BIN}" ]; then
+ SPARK_ASSEMBLY_CLASSPATH=$("${SPARK_ASSEBMLY_BIN}" 2>/dev/null)
+ CLASSPATH="${CLASSPATH}:${SPARK_ASSEMBLY_CLASSPATH}"
+ else
+ echo "Cannot find Spark assembly classpath. Is 'SPARK_HOME' set?"
+ exit -1
+ fi
fi
# add release dependencies to CLASSPATH
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/bin/mahout-load-spark-env.sh
----------------------------------------------------------------------
diff --git a/bin/mahout-load-spark-env.sh b/bin/mahout-load-spark-env.sh
new file mode 100755
index 0000000..533eecf
--- /dev/null
+++ b/bin/mahout-load-spark-env.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script loads spark-env.sh if it exists, and ensures it is only loaded once.
+# spark-env.sh is loaded from SPARK_CONF_DIR if set, or within the current directory's
+# conf/ subdirectory.
+FWDIR="$SPARK_HOME"
+
+if [ -z "$SPARK_ENV_LOADED" ]; then
+ export SPARK_ENV_LOADED=1
+
+ # Returns the parent of the directory this script lives in.
+ parent_dir="$(cd "`dirname "$0"`"/..; pwd)"
+
+ user_conf_dir="${SPARK_CONF_DIR:-"$parent_dir"/conf}"
+
+ if [ -f "${user_conf_dir}/spark-env.sh" ]; then
+ # Promote all variable declarations to environment (exported) variables
+ set -a
+ . "${user_conf_dir}/spark-env.sh"
+ set +a
+ fi
+fi
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/bin/mahout-spark-class.sh
----------------------------------------------------------------------
diff --git a/bin/mahout-spark-class.sh b/bin/mahout-spark-class.sh
new file mode 100755
index 0000000..ef88829
--- /dev/null
+++ b/bin/mahout-spark-class.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Figure out where Spark is installed
+#export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+
+#"$SPARK_HOME"/bin/load-spark-env.sh # not executable by default in $SPARK_HOME/bin
+"$MAHOUT_HOME"/bin/mahout-load-spark-env.sh
+
+# Find the java binary
+if [ -n "${JAVA_HOME}" ]; then
+ RUNNER="${JAVA_HOME}/bin/java"
+else
+ if [ `command -v java` ]; then
+ RUNNER="java"
+ else
+ echo "JAVA_HOME is not set" >&2
+ exit 1
+ fi
+fi
+
+# Find assembly jar
+SPARK_ASSEMBLY_JAR=
+if [ -f "$SPARK_HOME/RELEASE" ]; then
+ ASSEMBLY_DIR="$SPARK_HOME/lib"
+else
+ ASSEMBLY_DIR="$SPARK_HOME/assembly/target/scala-$SPARK_SCALA_VERSION"
+fi
+
+num_jars="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" | wc -l)"
+if [ "$num_jars" -eq "0" -a -z "$SPARK_ASSEMBLY_JAR" ]; then
+ echo "Failed to find Spark assembly in $ASSEMBLY_DIR." 1>&2
+ echo "You need to build Spark before running this program." 1>&2
+ exit 1
+fi
+ASSEMBLY_JARS="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" || true)"
+if [ "$num_jars" -gt "1" ]; then
+ echo "Found multiple Spark assembly jars in $ASSEMBLY_DIR:" 1>&2
+ echo "$ASSEMBLY_JARS" 1>&2
+ echo "Please remove all but one jar." 1>&2
+ exit 1
+fi
+
+SPARK_ASSEMBLY_JAR="${ASSEMBLY_DIR}/${ASSEMBLY_JARS}"
+
+LAUNCH_CLASSPATH="$SPARK_ASSEMBLY_JAR"
+
+# Add the launcher build dir to the classpath if requested.
+if [ -n "$SPARK_PREPEND_CLASSES" ]; then
+ LAUNCH_CLASSPATH="$SPARK_HOME/launcher/target/scala-$SPARK_SCALA_VERSION/classes:$LAUNCH_CLASSPATH"
+fi
+
+export _SPARK_ASSEMBLY="$SPARK_ASSEMBLY_JAR"
+
+echo $LAUNCH_CLASSPATH
+
+# The launcher library will print arguments separated by a NULL character, to allow arguments with
+# characters that would be otherwise interpreted by the shell. Read that in a while loop, populating
+# an array that will be used to exec the final command.
+#CMD=()
+#while IFS= read -d '' -r ARG; do
+# CMD+=("$ARG")
+#done < <("$RUNNER" -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@")
+#exec "${CMD[@]}"
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 1c3b376..c8e6874 100644
--- a/pom.xml
+++ b/pom.xml
@@ -117,10 +117,10 @@
<mscala.version>3.2.0</mscala.version>
<hbase.version>1.0.0</hbase.version>
<lucene.version>4.6.1</lucene.version>
- <slf4j.version>1.7.10</slf4j.version>
+ <slf4j.version>1.7.12</slf4j.version>
<scala.compat.version>2.10</scala.compat.version>
<scala.version>2.10.4</scala.version>
- <spark.version>1.3.1</spark.version>
+ <spark.version>1.4.1</spark.version>
<h2o.version>0.1.25</h2o.version>
</properties>
<issueManagement>
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
----------------------------------------------------------------------
diff --git a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
index 8df93bd..4770cde 100644
--- a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
+++ b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
@@ -157,7 +157,7 @@ class MahoutSparkILoop extends SparkILoop {
_ __ ___ __ _| |__ ___ _ _| |_
| '_ ` _ \ / _` | '_ \ / _ \| | | | __|
| | | | | | (_| | | | | (_) | |_| | |_
- |_| |_| |_|\__,_|_| |_|\___/ \__,_|\__| version 0.11.0
+ |_| |_| |_|\__,_|_| |_|\___/ \__,_|\__| version 0.11.1
""")
import Properties._