You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Michal Laclavik (JIRA)" <ji...@apache.org> on 2015/08/19 17:16:46 UTC
[jira] [Updated] (SPARK-10115) MLlib ALS training fails with
java.lang.ClassCastException
[ https://issues.apache.org/jira/browse/SPARK-10115?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Michal Laclavik updated SPARK-10115:
------------------------------------
Description:
I am running ALS collaborative filtering training on data which looks as follows (sample obtained by running "user_product.take(10)"):
{code}
[(1205640308657491975, 50233468418, 1.0),
(4743366459073625989, 50233472294, 1.0),
(4743366459073625989, 50233473253, 1.0),
(4743366459073625989, 75586230246, 1.0),
(4743366459073625989, 50233473248, 1.0),
(56766162624422850, 74848929776, 1.0),
(56766162624422850, 50233473397, 1.0),
(56766162624422850, 78185852309, 1.0),
(56766162624422850, 73533710263, 1.0),
(56766162624422850, 78185852319, 1.0)]
{code}
and then I call training on that RDD:
{code}
rank = 12
iterations=5
model = ALS.train(user_product, rank, iterations)
{code}
and I get the following error:
{code}
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-54-4e711b94952d> in <module>()
2 rank = 12
3 iterations=5
----> 4 model = ALS.train(user_product, rank, iterations)
/opt/spark/python/pyspark/mllib/recommendation.py in train(cls, ratings, rank, iterations, lambda_, blocks, nonnegative, seed)
192 seed=None):
193 model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations,
--> 194 lambda_, blocks, nonnegative, seed)
195 return MatrixFactorizationModel(model)
196
/opt/spark/python/pyspark/mllib/common.py in callMLlibFunc(name, *args)
126 sc = SparkContext._active_spark_context
127 api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 128 return callJavaFunc(sc, api, *args)
129
130
/opt/spark/python/pyspark/mllib/common.py in callJavaFunc(sc, func, *args)
119 """ Call Java Function """
120 args = [_py2java(sc, a) for a in args]
--> 121 return _java2py(sc, func(*args))
122
123
/opt/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/opt/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling o448.trainALSModel.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 9 in stage 57.0 failed 1 times, most recent failure: Lost task 9.0 in stage 57.0 (TID 4187, localhost): java.lang.ClassCastException
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
{code}
was:
I am running ALS collaborative filtering training on data which looks as follows (sample obtained by running "user_product.take(10)"):
{code}
[(1205640308657491975, 50233468418, 1.0),
(4743366459073625989, 50233472294, 1.0),
(4743366459073625989, 50233473253, 1.0),
(4743366459073625989, 75586230246, 1.0),
(4743366459073625989, 50233473248, 1.0),
(56766162624422850, 74848929776, 1.0),
(56766162624422850, 50233473397, 1.0),
(56766162624422850, 78185852309, 1.0),
(56766162624422850, 73533710263, 1.0),
(56766162624422850, 78185852319, 1.0)]
{code}
and then I call training on that RDD and I get the following error:
{code}
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-54-4e711b94952d> in <module>()
2 rank = 12
3 iterations=5
----> 4 model = ALS.train(user_product, rank, iterations)
/opt/spark/python/pyspark/mllib/recommendation.py in train(cls, ratings, rank, iterations, lambda_, blocks, nonnegative, seed)
192 seed=None):
193 model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations,
--> 194 lambda_, blocks, nonnegative, seed)
195 return MatrixFactorizationModel(model)
196
/opt/spark/python/pyspark/mllib/common.py in callMLlibFunc(name, *args)
126 sc = SparkContext._active_spark_context
127 api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 128 return callJavaFunc(sc, api, *args)
129
130
/opt/spark/python/pyspark/mllib/common.py in callJavaFunc(sc, func, *args)
119 """ Call Java Function """
120 args = [_py2java(sc, a) for a in args]
--> 121 return _java2py(sc, func(*args))
122
123
/opt/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/opt/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling o448.trainALSModel.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 9 in stage 57.0 failed 1 times, most recent failure: Lost task 9.0 in stage 57.0 (TID 4187, localhost): java.lang.ClassCastException
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
{code}
> MLlib ALS training fails with java.lang.ClassCastException
> ----------------------------------------------------------
>
> Key: SPARK-10115
> URL: https://issues.apache.org/jira/browse/SPARK-10115
> Project: Spark
> Issue Type: Bug
> Environment: first experienced on Spark 1.2.1, but then also with the latest
> spark-1.4.1-bin-hadoop2.6
> Reporter: Michal Laclavik
>
> I am running ALS collaborative filtering training on data which looks as follows (sample obtained by running "user_product.take(10)"):
> {code}
> [(1205640308657491975, 50233468418, 1.0),
> (4743366459073625989, 50233472294, 1.0),
> (4743366459073625989, 50233473253, 1.0),
> (4743366459073625989, 75586230246, 1.0),
> (4743366459073625989, 50233473248, 1.0),
> (56766162624422850, 74848929776, 1.0),
> (56766162624422850, 50233473397, 1.0),
> (56766162624422850, 78185852309, 1.0),
> (56766162624422850, 73533710263, 1.0),
> (56766162624422850, 78185852319, 1.0)]
> {code}
> and then I call training on that RDD:
> {code}
> rank = 12
> iterations=5
> model = ALS.train(user_product, rank, iterations)
> {code}
> and I get the following error:
> {code}
> ---------------------------------------------------------------------------
> Py4JJavaError Traceback (most recent call last)
> <ipython-input-54-4e711b94952d> in <module>()
> 2 rank = 12
> 3 iterations=5
> ----> 4 model = ALS.train(user_product, rank, iterations)
> /opt/spark/python/pyspark/mllib/recommendation.py in train(cls, ratings, rank, iterations, lambda_, blocks, nonnegative, seed)
> 192 seed=None):
> 193 model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations,
> --> 194 lambda_, blocks, nonnegative, seed)
> 195 return MatrixFactorizationModel(model)
> 196
> /opt/spark/python/pyspark/mllib/common.py in callMLlibFunc(name, *args)
> 126 sc = SparkContext._active_spark_context
> 127 api = getattr(sc._jvm.PythonMLLibAPI(), name)
> --> 128 return callJavaFunc(sc, api, *args)
> 129
> 130
> /opt/spark/python/pyspark/mllib/common.py in callJavaFunc(sc, func, *args)
> 119 """ Call Java Function """
> 120 args = [_py2java(sc, a) for a in args]
> --> 121 return _java2py(sc, func(*args))
> 122
> 123
> /opt/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
> 536 answer = self.gateway_client.send_command(command)
> 537 return_value = get_return_value(answer, self.gateway_client,
> --> 538 self.target_id, self.name)
> 539
> 540 for temp_arg in temp_args:
> /opt/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
> 298 raise Py4JJavaError(
> 299 'An error occurred while calling {0}{1}{2}.\n'.
> --> 300 format(target_id, '.', name), value)
> 301 else:
> 302 raise Py4JError(
> Py4JJavaError: An error occurred while calling o448.trainALSModel.
> : org.apache.spark.SparkException: Job aborted due to stage failure: Task 9 in stage 57.0 failed 1 times, most recent failure: Lost task 9.0 in stage 57.0 (TID 4187, localhost): java.lang.ClassCastException
> Driver stacktrace:
> at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
> at scala.Option.foreach(Option.scala:236)
> at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org