You are viewing a plain text version of this content. The canonical (HTML) version is available at the original mailing-list archive URL.
Posted to issues@spark.apache.org by "Patrick Wendell (JIRA)" <ji...@apache.org> on 2014/10/17 23:13:34 UTC
[jira] [Resolved] (SPARK-3855) Binding Exception when running PythonUDFs
[ https://issues.apache.org/jira/browse/SPARK-3855?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Patrick Wendell resolved SPARK-3855.
------------------------------------
Resolution: Fixed
Fix Version/s: 1.2.0
> Binding Exception when running PythonUDFs
> -----------------------------------------
>
> Key: SPARK-3855
> URL: https://issues.apache.org/jira/browse/SPARK-3855
> Project: Spark
> Issue Type: Bug
> Components: PySpark, SQL
> Affects Versions: 1.1.0
> Reporter: Michael Armbrust
> Assignee: Michael Armbrust
> Fix For: 1.2.0
>
>
> {code}
> from pyspark import *
> from pyspark.sql import *
> sc = SparkContext()
> sqlContext = SQLContext(sc)
> sqlContext.registerFunction("strlen", lambda string: len(string))
> sqlContext.inferSchema(sc.parallelize([Row(a="test")])).registerTempTable("test")
> srdd = sqlContext.sql("SELECT strlen(a) FROM test WHERE strlen(a) > 1")
> print srdd._jschema_rdd.baseSchemaRDD().queryExecution().toString()
> print srdd.collect()
> {code}
> output:
> {code}
> == Parsed Logical Plan ==
> Project ['strlen('a) AS c0#1]
> Filter ('strlen('a) > 1)
> UnresolvedRelation None, test, None
> == Analyzed Logical Plan ==
> Project [c0#1]
> Project [pythonUDF#2 AS c0#1]
> EvaluatePython PythonUDF#strlen(a#0)
> Project [a#0]
> Filter (CAST(pythonUDF#3, DoubleType) > CAST(1, DoubleType))
> EvaluatePython PythonUDF#strlen(a#0)
> SparkLogicalPlan (ExistingRdd [a#0], MapPartitionsRDD[7] at mapPartitions at SQLContext.scala:525)
> == Optimized Logical Plan ==
> Project [pythonUDF#2 AS c0#1]
> EvaluatePython PythonUDF#strlen(a#0)
> Project [a#0]
> Filter (CAST(pythonUDF#3, DoubleType) > 1.0)
> EvaluatePython PythonUDF#strlen(a#0)
> SparkLogicalPlan (ExistingRdd [a#0], MapPartitionsRDD[7] at mapPartitions at SQLContext.scala:525)
> == Physical Plan ==
> Project [pythonUDF#2 AS c0#1]
> BatchPythonEvaluation PythonUDF#strlen(a#0), [a#0,pythonUDF#5]
> Project [a#0]
> Filter (CAST(pythonUDF#3, DoubleType) > 1.0)
> BatchPythonEvaluation PythonUDF#strlen(a#0), [a#0,pythonUDF#3]
> ExistingRdd [a#0], MapPartitionsRDD[7] at mapPartitions at SQLContext.scala:525
> Code Generation: false
> == RDD ==
> 14/10/08 15:03:00 ERROR Executor: Exception in task 1.0 in stage 4.0 (TID 9)
> org.apache.spark.sql.catalyst.errors.package$TreeNodeException: Binding attribute, tree: pythonUDF#2
> at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:47)
> at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:47)
> at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:46)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:144)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:162)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to(Iterator.scala:1157)
> at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildrenDown(TreeNode.scala:191)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:147)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:135)
> at org.apache.spark.sql.catalyst.expressions.BindReferences$.bindReference(BoundAttribute.scala:46)
> at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection$$anonfun$$init$$2.apply(Projection.scala:52)
> at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection$$anonfun$$init$$2.apply(Projection.scala:52)
> at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
> at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
> at scala.collection.immutable.List.foreach(List.scala:318)
> at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
> at scala.collection.AbstractTraversable.map(Traversable.scala:105)
> at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.<init>(Projection.scala:52)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$newMutableProjection$1.apply(SparkPlan.scala:106)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$newMutableProjection$1.apply(SparkPlan.scala:106)
> at org.apache.spark.sql.execution.Project$$anonfun$1.apply(basicOperators.scala:43)
> at org.apache.spark.sql.execution.Project$$anonfun$1.apply(basicOperators.scala:42)
> at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:599)
> at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:599)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
> at org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
> at org.apache.spark.scheduler.Task.run(Task.scala:56)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:182)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> at java.lang.Thread.run(Thread.java:745)
> Caused by: java.lang.RuntimeException: Couldn't find pythonUDF#2 in [a#0,pythonUDF#5]
> at scala.sys.package$.error(package.scala:27)
> at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:53)
> at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:47)
> at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:46)
> ... 46 more
> 14/10/08 15:03:00 ERROR Executor: Exception in task 0.0 in stage 4.0 (TID 8)
> org.apache.spark.sql.catalyst.errors.package$TreeNodeException: Binding attribute, tree: pythonUDF#2
> at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:47)
> at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:47)
> at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:46)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:144)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:162)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to(Iterator.scala:1157)
> at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildrenDown(TreeNode.scala:191)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:147)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:135)
> at org.apache.spark.sql.catalyst.expressions.BindReferences$.bindReference(BoundAttribute.scala:46)
> at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection$$anonfun$$init$$2.apply(Projection.scala:52)
> at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection$$anonfun$$init$$2.apply(Projection.scala:52)
> at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
> at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
> at scala.collection.immutable.List.foreach(List.scala:318)
> at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
> at scala.collection.AbstractTraversable.map(Traversable.scala:105)
> at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.<init>(Projection.scala:52)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$newMutableProjection$1.apply(SparkPlan.scala:106)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$newMutableProjection$1.apply(SparkPlan.scala:106)
> at org.apache.spark.sql.execution.Project$$anonfun$1.apply(basicOperators.scala:43)
> at org.apache.spark.sql.execution.Project$$anonfun$1.apply(basicOperators.scala:42)
> at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:599)
> at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:599)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
> at org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
> at org.apache.spark.scheduler.Task.run(Task.scala:56)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:182)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> at java.lang.Thread.run(Thread.java:745)
> Caused by: java.lang.RuntimeException: Couldn't find pythonUDF#2 in [a#0,pythonUDF#5]
> at scala.sys.package$.error(package.scala:27)
> at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:53)
> at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:47)
> at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:46)
> ... 46 more
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org