Posted to commits@mahout.apache.org by sm...@apache.org on 2015/11/05 04:28:10 UTC

mahout git commit: MAHOUT-1778: Mahout Spark Shell doesn't work with Spark > 1.3, this closes apache/mahout#164

Repository: mahout
Updated Branches:
  refs/heads/master 82e78a8c9 -> e0b8b90e9


MAHOUT-1778: Mahout Spark Shell doesn't work with Spark > 1.3, this closes apache/mahout#164


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/e0b8b90e
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/e0b8b90e
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/e0b8b90e

Branch: refs/heads/master
Commit: e0b8b90e91c2ea72e2562d504ad16051cf759787
Parents: 82e78a8
Author: smarthi <sm...@apache.org>
Authored: Wed Nov 4 22:27:36 2015 -0500
Committer: smarthi <sm...@apache.org>
Committed: Wed Nov 4 22:27:36 2015 -0500

----------------------------------------------------------------------
 bin/compute-classpath.sh                        | 186 +++++++++++++++++++
 bin/mahout                                      |  10 +-
 bin/mahout-load-spark-env.sh                    |  40 ++++
 bin/mahout-spark-class.sh                       |  80 ++++++++
 pom.xml                                         |   4 +-
 .../sparkbindings/shell/MahoutSparkILoop.scala  |   2 +-
 6 files changed, 318 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/bin/compute-classpath.sh
----------------------------------------------------------------------
diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
new file mode 100755
index 0000000..79898e4
--- /dev/null
+++ b/bin/compute-classpath.sh
@@ -0,0 +1,186 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script computes Spark's classpath and prints it to stdout; it's used by both the "run"
+# script and the ExecutorRunner in standalone cluster mode.
+
+# Figure out where Spark is installed
+#FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
+FWDIR="$SPARK_HOME"
+
+#. "$FWDIR"/bin/load-spark-env.sh # not executable by defult in $SPARK_HOME/bin
+
+"$MAHOUT_HOME"/bin/mahout-load-spark-env.sh
+
+# Compute the Scala version. Note: Mahout has not been tested with Scala 2.11.
+# Set SPARK_SCALA_VERSION if not already set.
+
+if [ -z "$SPARK_SCALA_VERSION" ]; then
+
+    ASSEMBLY_DIR2="$FWDIR/assembly/target/scala-2.11"
+    ASSEMBLY_DIR1="$FWDIR/assembly/target/scala-2.10"
+
+    if [[ -d "$ASSEMBLY_DIR2" && -d "$ASSEMBLY_DIR1" ]]; then
+        echo -e "Presence of build for both scala versions(SCALA 2.10 and SCALA 2.11) detected." 1>&2
+        echo -e 'Either clean one of them or, export SPARK_SCALA_VERSION=2.11 in spark-env.sh.' 1>&2
+        exit 1
+    fi
+
+    if [ -d "$ASSEMBLY_DIR2" ]; then
+        export SPARK_SCALA_VERSION="2.11"
+    else
+        export SPARK_SCALA_VERSION="2.10"
+    fi
+fi
+
+
+appendToClasspath() {
+  if [ -n "$1" ]; then
+    if [ -n "$CLASSPATH" ]; then
+      CLASSPATH="$CLASSPATH:$1"
+    else
+      CLASSPATH="$1"
+    fi
+  fi
+}
+
+appendToClasspath "$SPARK_CLASSPATH"
+appendToClasspath "$SPARK_SUBMIT_CLASSPATH"
+
+# Build up classpath
+if [ -n "$SPARK_CONF_DIR" ]; then
+  appendToClasspath "$SPARK_CONF_DIR"
+else
+  appendToClasspath "$FWDIR/conf"
+fi
+
+ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SPARK_SCALA_VERSION"
+
+if [ -n "$JAVA_HOME" ]; then
+  JAR_CMD="$JAVA_HOME/bin/jar"
+else
+  JAR_CMD="jar"
+fi
+
+# A developer option to prepend more recently compiled Spark classes
+if [ -n "$SPARK_PREPEND_CLASSES" ]; then
+  echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\
+    "classes ahead of assembly." >&2
+  # Spark classes
+  appendToClasspath "$FWDIR/core/target/scala-$SPARK_SCALA_VERSION/classes"
+  appendToClasspath "$FWDIR/repl/target/scala-$SPARK_SCALA_VERSION/classes"
+  appendToClasspath "$FWDIR/mllib/target/scala-$SPARK_SCALA_VERSION/classes"
+  appendToClasspath "$FWDIR/bagel/target/scala-$SPARK_SCALA_VERSION/classes"
+  appendToClasspath "$FWDIR/graphx/target/scala-$SPARK_SCALA_VERSION/classes"
+  appendToClasspath "$FWDIR/streaming/target/scala-$SPARK_SCALA_VERSION/classes"
+  appendToClasspath "$FWDIR/tools/target/scala-$SPARK_SCALA_VERSION/classes"
+  appendToClasspath "$FWDIR/sql/catalyst/target/scala-$SPARK_SCALA_VERSION/classes"
+  appendToClasspath "$FWDIR/sql/core/target/scala-$SPARK_SCALA_VERSION/classes"
+  appendToClasspath "$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/classes"
+  appendToClasspath "$FWDIR/sql/hive-thriftserver/target/scala-$SPARK_SCALA_VERSION/classes"
+  appendToClasspath "$FWDIR/yarn/stable/target/scala-$SPARK_SCALA_VERSION/classes"
+  # Jars for shaded deps in their original form (copied here during build)
+  appendToClasspath "$FWDIR/core/target/jars/*"
+fi
+
+# Use spark-assembly jar from either RELEASE or assembly directory
+if [ -f "$FWDIR/RELEASE" ]; then
+  assembly_folder="$FWDIR"/lib
+else
+  assembly_folder="$ASSEMBLY_DIR"
+fi
+
+num_jars=0
+
+for f in "${assembly_folder}"/spark-assembly*hadoop*.jar; do
+  if [[ ! -e "$f" ]]; then
+    echo "Failed to find Spark assembly in $assembly_folder" 1>&2
+    echo "You need to build Spark before running this program." 1>&2
+    exit 1
+  fi
+  ASSEMBLY_JAR="$f"
+  num_jars=$((num_jars+1))
+done
+
+if [ "$num_jars" -gt "1" ]; then
+  echo "Found multiple Spark assembly jars in $assembly_folder:" 1>&2
+  ls "${assembly_folder}"/spark-assembly*hadoop*.jar 1>&2
+  echo "Please remove all but one jar." 1>&2
+  exit 1
+fi
+
+# Only able to make this check if 'jar' command is available
+if command -v "$JAR_CMD" > /dev/null; then
+  # Verify that versions of java used to build the jars and run Spark are compatible
+  jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
+  if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
+    echo "Loading Spark jar with '$JAR_CMD' failed. " 1>&2
+    echo "This is likely because Spark was compiled with Java 7 and run " 1>&2
+    echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark " 1>&2
+    echo "or build Spark with Java 6." 1>&2
+    exit 1
+  fi
+fi
+
+appendToClasspath "$ASSEMBLY_JAR"
+
+# When Hive support is needed, Datanucleus jars must be included on the classpath.
+# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
+# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
+# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
+# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
+# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
+if [ -f "$FWDIR/RELEASE" ]; then
+  datanucleus_dir="$FWDIR"/lib
+else
+  datanucleus_dir="$FWDIR"/lib_managed/jars
+fi
+
+datanucleus_jars="$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar$")"
+datanucleus_jars="$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)"
+
+if [ -n "$datanucleus_jars" ]; then
+  appendToClasspath "$datanucleus_jars"
+fi
+
+# Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1
+if [[ $SPARK_TESTING == 1 ]]; then
+  appendToClasspath "$FWDIR/core/target/scala-$SPARK_SCALA_VERSION/test-classes"
+  appendToClasspath "$FWDIR/repl/target/scala-$SPARK_SCALA_VERSION/test-classes"
+  appendToClasspath "$FWDIR/mllib/target/scala-$SPARK_SCALA_VERSION/test-classes"
+  appendToClasspath "$FWDIR/bagel/target/scala-$SPARK_SCALA_VERSION/test-classes"
+  appendToClasspath "$FWDIR/graphx/target/scala-$SPARK_SCALA_VERSION/test-classes"
+  appendToClasspath "$FWDIR/streaming/target/scala-$SPARK_SCALA_VERSION/test-classes"
+  appendToClasspath "$FWDIR/sql/catalyst/target/scala-$SPARK_SCALA_VERSION/test-classes"
+  appendToClasspath "$FWDIR/sql/core/target/scala-$SPARK_SCALA_VERSION/test-classes"
+  appendToClasspath "$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/test-classes"
+fi
+
+# Add the Hadoop conf dir if given -- otherwise FileSystem.* calls, etc. fail!
+# Note: this assumes that there is either a HADOOP_CONF_DIR or a YARN_CONF_DIR which hosts
+# the configuration files.
+appendToClasspath "$HADOOP_CONF_DIR"
+appendToClasspath "$YARN_CONF_DIR"
+
+# To allow for distributions to append needed libraries to the classpath (e.g. when
+# using the "hadoop-provided" profile to build Spark), check SPARK_DIST_CLASSPATH and
+# append it to the final classpath.
+appendToClasspath "$SPARK_DIST_CLASSPATH"
+
+echo "$CLASSPATH"

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/bin/mahout
----------------------------------------------------------------------
diff --git a/bin/mahout b/bin/mahout
index 24f01ba..b16d51b 100755
--- a/bin/mahout
+++ b/bin/mahout
@@ -211,7 +211,7 @@ then
        CLASSPATH=${CLASSPATH}:$f;
     done
 
-    SPARK_CP_BIN="${SPARK_HOME}/bin/compute-classpath.sh"
+    SPARK_CP_BIN="${MAHOUT_HOME}/bin/compute-classpath.sh"
     if [ -x "${SPARK_CP_BIN}" ]; then
        SPARK_CLASSPATH=$("${SPARK_CP_BIN}" 2>/dev/null)
        CLASSPATH="${CLASSPATH}:${SPARK_CLASSPATH}"
@@ -220,6 +220,14 @@ then
       exit -1
     fi
 
+    SPARK_ASSEMBLY_BIN="${MAHOUT_HOME}/bin/mahout-spark-class.sh"
+    if [ -x "${SPARK_ASSEMBLY_BIN}" ]; then
+       SPARK_ASSEMBLY_CLASSPATH=$("${SPARK_ASSEMBLY_BIN}" 2>/dev/null)
+       CLASSPATH="${CLASSPATH}:${SPARK_ASSEMBLY_CLASSPATH}"
+    else
+      echo "Cannot find Spark assembly classpath. Is 'SPARK_HOME' set?"
+      exit -1
+    fi
   fi
 
   # add release dependencies to CLASSPATH
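
Taken together, the intent of the two helper calls in this hunk, as a
standalone sketch (error handling elided; assumes MAHOUT_HOME is set and both
helper scripts are executable):

    # Each helper prints a classpath fragment on stdout.
    SPARK_CLASSPATH=$("${MAHOUT_HOME}/bin/compute-classpath.sh" 2>/dev/null)
    SPARK_ASSEMBLY_CLASSPATH=$("${MAHOUT_HOME}/bin/mahout-spark-class.sh" 2>/dev/null)
    CLASSPATH="${CLASSPATH}:${SPARK_CLASSPATH}:${SPARK_ASSEMBLY_CLASSPATH}"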

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/bin/mahout-load-spark-env.sh
----------------------------------------------------------------------
diff --git a/bin/mahout-load-spark-env.sh b/bin/mahout-load-spark-env.sh
new file mode 100755
index 0000000..533eecf
--- /dev/null
+++ b/bin/mahout-load-spark-env.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script loads spark-env.sh if it exists, and ensures it is only loaded once.
+# spark-env.sh is loaded from SPARK_CONF_DIR if set, otherwise from the conf/
+# subdirectory of this script's parent directory.
+FWDIR="$SPARK_HOME"
+
+if [ -z "$SPARK_ENV_LOADED" ]; then
+  export SPARK_ENV_LOADED=1
+
+  # Returns the parent of the directory this script lives in.
+  parent_dir="$(cd "`dirname "$0"`"/..; pwd)"
+
+  user_conf_dir="${SPARK_CONF_DIR:-"$parent_dir"/conf}"
+
+  if [ -f "${user_conf_dir}/spark-env.sh" ]; then
+    # Promote all variable declarations to environment (exported) variables
+    set -a
+    . "${user_conf_dir}/spark-env.sh"
+    set +a
+  fi
+fi
+
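
As an illustration, a hypothetical spark-env.sh that this loader would pick up;
because of the set -a / set +a guard above, every plain assignment in the file
becomes an exported environment variable (the values here are made-up examples):

    # Hypothetical example values only.
    SPARK_SCALA_VERSION=2.10
    SPARK_CLASSPATH=/opt/extra/jars/*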

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/bin/mahout-spark-class.sh
----------------------------------------------------------------------
diff --git a/bin/mahout-spark-class.sh b/bin/mahout-spark-class.sh
new file mode 100755
index 0000000..ef88829
--- /dev/null
+++ b/bin/mahout-spark-class.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Figure out where Spark is installed
+#export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+
+#"$SPARK_HOME"/bin/load-spark-env.sh # not executable by defult in $SPARK_HOME/bin
+"$MAHOUT_HOME"/bin/mahout-load-spark-env.sh
+
+# Find the java binary
+if [ -n "${JAVA_HOME}" ]; then
+  RUNNER="${JAVA_HOME}/bin/java"
+else
+  if [ `command -v java` ]; then
+    RUNNER="java"
+  else
+    echo "JAVA_HOME is not set" >&2
+    exit 1
+  fi
+fi
+
+# Find assembly jar
+SPARK_ASSEMBLY_JAR=
+if [ -f "$SPARK_HOME/RELEASE" ]; then
+  ASSEMBLY_DIR="$SPARK_HOME/lib"
+else
+  ASSEMBLY_DIR="$SPARK_HOME/assembly/target/scala-$SPARK_SCALA_VERSION"
+fi
+
+num_jars="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" | wc -l)"
+if [ "$num_jars" -eq "0" -a -z "$SPARK_ASSEMBLY_JAR" ]; then
+  echo "Failed to find Spark assembly in $ASSEMBLY_DIR." 1>&2
+  echo "You need to build Spark before running this program." 1>&2
+  exit 1
+fi
+ASSEMBLY_JARS="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" || true)"
+if [ "$num_jars" -gt "1" ]; then
+  echo "Found multiple Spark assembly jars in $ASSEMBLY_DIR:" 1>&2
+  echo "$ASSEMBLY_JARS" 1>&2
+  echo "Please remove all but one jar." 1>&2
+  exit 1
+fi
+
+SPARK_ASSEMBLY_JAR="${ASSEMBLY_DIR}/${ASSEMBLY_JARS}"
+
+LAUNCH_CLASSPATH="$SPARK_ASSEMBLY_JAR"
+
+# Add the launcher build dir to the classpath if requested.
+if [ -n "$SPARK_PREPEND_CLASSES" ]; then
+  LAUNCH_CLASSPATH="$SPARK_HOME/launcher/target/scala-$SPARK_SCALA_VERSION/classes:$LAUNCH_CLASSPATH"
+fi
+
+export _SPARK_ASSEMBLY="$SPARK_ASSEMBLY_JAR"
+
+echo "$LAUNCH_CLASSPATH"
+
+# The launcher library will print arguments separated by a NULL character, to allow arguments with
+# characters that would otherwise be interpreted by the shell. Read that in a while loop, populating
+# an array that will be used to exec the final command.
+#CMD=()
+#while IFS= read -d '' -r ARG; do
+#  CMD+=("$ARG")
+#done < <("$RUNNER" -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@")
+#exec "${CMD[@]}"
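
Usage-wise, the script just prints the path of the single Spark assembly jar it
finds (when SPARK_PREPEND_CLASSES is unset). Hypothetical output for a binary
Spark 1.4.1 distribution; the exact jar name depends on the Spark/Hadoop build:

    $ "$MAHOUT_HOME"/bin/mahout-spark-class.sh
    /opt/spark/lib/spark-assembly-1.4.1-hadoop2.6.0.jar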

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 1c3b376..c8e6874 100644
--- a/pom.xml
+++ b/pom.xml
@@ -117,10 +117,10 @@
     <mscala.version>3.2.0</mscala.version>
     <hbase.version>1.0.0</hbase.version>
     <lucene.version>4.6.1</lucene.version>
-    <slf4j.version>1.7.10</slf4j.version>
+    <slf4j.version>1.7.12</slf4j.version>
     <scala.compat.version>2.10</scala.compat.version>
     <scala.version>2.10.4</scala.version>
-    <spark.version>1.3.1</spark.version>
+    <spark.version>1.4.1</spark.version>
     <h2o.version>0.1.25</h2o.version>
   </properties>
   <issueManagement>
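
Since spark.version is an ordinary Maven property, building against a different
Spark 1.4.x point release can be sketched with a standard command-line property
override (plain Maven behavior, not something this commit adds):

    mvn clean install -DskipTests -Dspark.version=1.4.0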

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0b8b90e/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
----------------------------------------------------------------------
diff --git a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
index 8df93bd..4770cde 100644
--- a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
+++ b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
@@ -157,7 +157,7 @@ class MahoutSparkILoop extends SparkILoop {
          _ __ ___   __ _| |__   ___  _   _| |_
         | '_ ` _ \ / _` | '_ \ / _ \| | | | __|
         | | | | | | (_| | | | | (_) | |_| | |_
-        |_| |_| |_|\__,_|_| |_|\___/ \__,_|\__|  version 0.11.0
+        |_| |_| |_|\__,_|_| |_|\___/ \__,_|\__|  version 0.11.1
 
       """)
     import Properties._