You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@mahout.apache.org by "Stefan Goldener (Jira)" <ji...@apache.org> on 2020/02/27 15:46:00 UTC

[jira] [Updated] (MAHOUT-2093) Mahout Source Broken

     [ https://issues.apache.org/jira/browse/MAHOUT-2093?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Stefan Goldener updated MAHOUT-2093:
------------------------------------
    Description: 
Newer versions of Mahout seem to have problems with the Spark bindings: for example, mahout spark-itemsimilarity and mahout spark-rowsimilarity do not work, because their driver main classes cannot be found or loaded.
{code:java}
Error: Could not find or load main class org.apache.mahout.drivers.RowSimilarityDriver
{code}
{code:java}
Error: Could not find or load main class org.apache.mahout.drivers.ItemSimilarityDriver
{code}
whereas *mahout spark-shell* works flawlessly.

Here is a short Dockerfile to show the issue:
{code:yaml}
FROM openjdk:8-alpine
ENV spark_uid=185
ENV SCALA_MAJOR=2.11
ENV SCALA_MAJOR_MINOR=2.11.12
ENV HADOOP_MAJOR=2.7
ENV SPARK_MAJOR_MINOR=2.4.5
ENV MAHOUT_MAJOR_MINOR=0.14.0
ENV MAHOUT_VERSION=mahout-${MAHOUT_MAJOR_MINOR}
ENV MAHOUT_BASE=/opt/mahout
ENV MAHOUT_HOME=${MAHOUT_BASE}/${MAHOUT_VERSION}
ENV SPARK_VERSION=spark-${SPARK_MAJOR_MINOR}
ENV SPARK_BASE=/opt/spark
ENV SPARK_HOME=${SPARK_BASE}/${SPARK_VERSION}
ENV MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g"
ENV SPARK_SRC_URL="https://archive.apache.org/dist/spark/${SPARK_VERSION}/${SPARK_VERSION}.tgz"
ENV MAHOUT_SRC_URL="https://archive.apache.org/dist/mahout/${MAHOUT_MAJOR_MINOR}/mahout-${MAHOUT_MAJOR_MINOR}-source-release.zip"
ENV ZINC_PORT=3030

### build spark
RUN set -ex && \
    apk upgrade --no-cache && \
    ln -s /lib /lib64 && \
    apk add --no-cache bash python py-pip tini libc6-compat linux-pam krb5 krb5-libs nss curl openssl git maven && \
    pip install setuptools && \
    mkdir -p ${MAHOUT_HOME} && \
    mkdir -p ${SPARK_BASE} && \
    curl  -LfsS ${SPARK_SRC_URL} -o ${SPARK_HOME}.tgz  && \
    tar -xzvf ${SPARK_HOME}.tgz -C ${SPARK_BASE}/ && \
    rm ${SPARK_HOME}.tgz && \
    export PATH=$PATH:$MAHOUT_HOME/bin:$MAHOUT_HOME/lib:$SPARK_HOME/bin:$JAVA_HOME/bin && \
    bash ${SPARK_HOME}/dev/change-scala-version.sh ${SCALA_MAJOR} && \
    bash ${SPARK_HOME}/dev/make-distribution.sh --name ${DATE}-${REVISION} --pip --tgz -DzincPort=${ZINC_PORT} \
            -Phadoop-${HADOOP_MAJOR} -Pkubernetes -Pkinesis-asl -Phive -Phive-thriftserver -Pscala-${SCALA_MAJOR}
    
### build mahout
RUN curl -LfsS $MAHOUT_SRC_URL -o ${MAHOUT_BASE}.zip  && \
    unzip ${MAHOUT_BASE}.zip -d ${MAHOUT_BASE} && \ 
    rm ${MAHOUT_BASE}.zip && \
    cd ${MAHOUT_HOME} && \
    mvn -Dspark.version=${SPARK_MAJOR_MINOR} -Dscala.version=${SCALA_MAJOR_MINOR} -Dscala.compat.version=${SCALA_MAJOR} -DskipTests -Dmaven.javadoc.skip=true clean package 
{code}
docker build . -t mahout-test
 docker run -it mahout-test /bin/bash

  was:
Newer versions of Mahout seem to have problems with the Spark bindings: for example, mahout spark-itemsimilarity and mahout spark-rowsimilarity do not work, because their driver main classes cannot be found or loaded.
{code:java}
Error: Could not find or load main class org.apache.mahout.drivers.RowSimilarityDriver
{code}
{code:java}
Error: Could not find or load main class org.apache.mahout.drivers.ItemSimilarityDriver
{code}
whereas *mahout spark-shell* works flawlessly.

Here is a short Dockerfile to show the issue:
{code:yaml}
FROM openjdk:8-alpine
ENV spark_uid=185
ENV SCALA_MAJOR=2.11
ENV SCALA_MAJOR_MINOR=2.11.12
ENV HADOOP_MAJOR=2.7
ENV SPARK_MAJOR_MINOR=2.4.5
ENV MAHOUT_MAJOR_MINOR=0.14.0
ENV MAHOUT_VERSION=mahout-${MAHOUT_MAJOR_MINOR}
ENV MAHOUT_BASE=/opt/mahout
ENV MAHOUT_HOME=${MAHOUT_BASE}/${MAHOUT_VERSION}
ENV SPARK_VERSION=spark-${SPARK_MAJOR_MINOR}
ENV SPARK_BASE=/opt/spark
ENV SPARK_HOME=${SPARK_BASE}/${SPARK_VERSION}
ENV MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g"
ENV SPARK_SRC_URL="https://archive.apache.org/dist/spark/${SPARK_VERSION}/${SPARK_VERSION}.tgz"
ENV MAHOUT_SRC_URL="https://archive.apache.org/dist/mahout/${MAHOUT_MAJOR_MINOR}/mahout-${MAHOUT_MAJOR_MINOR}-source-release.zip"
ENV ZINC_PORT=3030### build spark
RUN set -ex && \
    apk upgrade --no-cache && \
    ln -s /lib /lib64 && \
    apk add --no-cache bash python py-pip tini libc6-compat linux-pam krb5 krb5-libs nss curl openssl git maven && \
    pip install setuptools && \
    mkdir -p ${MAHOUT_HOME} && \
    #mkdir -p /opt/mahout/examples && \
    #mkdir -p /opt/mahout/work-dir && \
    mkdir -p ${SPARK_BASE} && \
    curl  -LfsS ${SPARK_SRC_URL} -o ${SPARK_HOME}.tgz  && \
    tar -xzvf ${SPARK_HOME}.tgz -C ${SPARK_BASE}/ && \
    rm ${SPARK_HOME}.tgz && \
    export PATH=$PATH:$MAHOUT_HOME/bin:$MAHOUT_HOME/lib:$SPARK_HOME/bin:$JAVA_HOME/bin && \
    bash ${SPARK_HOME}/dev/change-scala-version.sh ${SCALA_MAJOR} && \
    bash ${SPARK_HOME}/dev/make-distribution.sh --name ${DATE}-${REVISION} --pip --tgz -DzincPort=${ZINC_PORT} \
            -Phadoop-${HADOOP_MAJOR} -Pkubernetes -Pkinesis-asl -Phive -Phive-thriftserver -Pscala-${SCALA_MAJOR}
    #bash ${SPARK_HOME}/build/mvn -Pkubernetes -Pscala-${SCALA_MAJOR} -DskipTests clean package
### build mahout
RUN curl -LfsS $MAHOUT_SRC_URL -o ${MAHOUT_BASE}.zip  && \
    unzip ${MAHOUT_BASE}.zip -d ${MAHOUT_BASE} && \ 
    rm ${MAHOUT_BASE}.zip && \
    cd ${MAHOUT_HOME} && \
    mvn -Dspark.version=${SPARK_MAJOR_MINOR} -Dscala.version=${SCALA_MAJOR_MINOR} -Dscala.compat.version=${SCALA_MAJOR} -DskipTests -Dmaven.javadoc.skip=true clean package 
{code}
docker build . -t mahout-test
 docker run -it mahout-test /bin/bash


> Mahout Source Broken
> --------------------
>
>                 Key: MAHOUT-2093
>                 URL: https://issues.apache.org/jira/browse/MAHOUT-2093
>             Project: Mahout
>          Issue Type: Bug
>          Components: Algorithms, Collaborative Filtering, Documentation
>    Affects Versions: 0.14.0, 0.13.2
>            Reporter: Stefan Goldener
>            Priority: Blocker
>
> Newer versions of Mahout seem to have problems with the Spark bindings: for example, mahout spark-itemsimilarity and mahout spark-rowsimilarity do not work, because their driver main classes cannot be found or loaded.
> {code:java}
> Error: Could not find or load main class org.apache.mahout.drivers.RowSimilarityDriver
> {code}
> {code:java}
> Error: Could not find or load main class org.apache.mahout.drivers.ItemSimilarityDriver
> {code}
> whereas *mahout spark-shell* works flawlessly.
> Here is a short Dockerfile to show the issue:
> {code:yaml}
> FROM openjdk:8-alpine
> ENV spark_uid=185
> ENV SCALA_MAJOR=2.11
> ENV SCALA_MAJOR_MINOR=2.11.12
> ENV HADOOP_MAJOR=2.7
> ENV SPARK_MAJOR_MINOR=2.4.5
> ENV MAHOUT_MAJOR_MINOR=0.14.0
> ENV MAHOUT_VERSION=mahout-${MAHOUT_MAJOR_MINOR}
> ENV MAHOUT_BASE=/opt/mahout
> ENV MAHOUT_HOME=${MAHOUT_BASE}/${MAHOUT_VERSION}
> ENV SPARK_VERSION=spark-${SPARK_MAJOR_MINOR}
> ENV SPARK_BASE=/opt/spark
> ENV SPARK_HOME=${SPARK_BASE}/${SPARK_VERSION}
> ENV MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g"
> ENV SPARK_SRC_URL="https://archive.apache.org/dist/spark/${SPARK_VERSION}/${SPARK_VERSION}.tgz"
> ENV MAHOUT_SRC_URL="https://archive.apache.org/dist/mahout/${MAHOUT_MAJOR_MINOR}/mahout-${MAHOUT_MAJOR_MINOR}-source-release.zip"
> ENV ZINC_PORT=3030
> ### build spark
> RUN set -ex && \
>     apk upgrade --no-cache && \
>     ln -s /lib /lib64 && \
>     apk add --no-cache bash python py-pip tini libc6-compat linux-pam krb5 krb5-libs nss curl openssl git maven && \
>     pip install setuptools && \
>     mkdir -p ${MAHOUT_HOME} && \
>     mkdir -p ${SPARK_BASE} && \
>     curl  -LfsS ${SPARK_SRC_URL} -o ${SPARK_HOME}.tgz  && \
>     tar -xzvf ${SPARK_HOME}.tgz -C ${SPARK_BASE}/ && \
>     rm ${SPARK_HOME}.tgz && \
>     export PATH=$PATH:$MAHOUT_HOME/bin:$MAHOUT_HOME/lib:$SPARK_HOME/bin:$JAVA_HOME/bin && \
>     bash ${SPARK_HOME}/dev/change-scala-version.sh ${SCALA_MAJOR} && \
>     bash ${SPARK_HOME}/dev/make-distribution.sh --name ${DATE}-${REVISION} --pip --tgz -DzincPort=${ZINC_PORT} \
>             -Phadoop-${HADOOP_MAJOR} -Pkubernetes -Pkinesis-asl -Phive -Phive-thriftserver -Pscala-${SCALA_MAJOR}
>     
> ### build mahout
> RUN curl -LfsS $MAHOUT_SRC_URL -o ${MAHOUT_BASE}.zip  && \
>     unzip ${MAHOUT_BASE}.zip -d ${MAHOUT_BASE} && \ 
>     rm ${MAHOUT_BASE}.zip && \
>     cd ${MAHOUT_HOME} && \
>     mvn -Dspark.version=${SPARK_MAJOR_MINOR} -Dscala.version=${SCALA_MAJOR_MINOR} -Dscala.compat.version=${SCALA_MAJOR} -DskipTests -Dmaven.javadoc.skip=true clean package 
> {code}
> docker build . -t mahout-test
>  docker run -it mahout-test /bin/bash



--
This message was sent by Atlassian Jira
(v8.3.4#803005)