You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@spark.apache.org by Jeff Thompson <je...@gmail.com> on 2015/10/09 05:47:50 UTC
error in sparkSQL 1.5 using count(1) in nested queries
After upgrading from 1.4.1 to 1.5.1 I found some of my spark SQL queries no
longer worked. The problem seems to be related to using count(1) or count(*) in a
nested query. I can reproduce the issue in a pyspark shell with the sample
code below. The 'people' table is from spark-1.5.1-bin-hadoop2.4/
examples/src/main/resources/people.json.
Environment details: Hadoop 2.5.0-cdh5.3.0, YARN
*Test code:*
from pyspark.sql import SQLContext
print(sc.version)
sqlContext = SQLContext(sc)
df = sqlContext.read.json("/user/thj1pal/people.json")
df.show()
sqlContext.registerDataFrameAsTable(df,"PEOPLE")
result = sqlContext.sql("SELECT MIN(t0.age) FROM (SELECT * FROM PEOPLE
WHERE age > 0) t0 HAVING(COUNT(1) > 0)")
result.show()
*spark 1.4.1 output*
1.4.1
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
+--+
|c0|
+--+
|19|
+--+
*spark 1.5.1 output*
1.5.1
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-1-342b585498f7> in <module>()
9
10 result = sqlContext.sql("SELECT MIN(t0.age) FROM (SELECT * FROM
PEOPLE WHERE age > 0) t0 HAVING(COUNT(1) > 0)")
---> 11 result.show()
/home/thj1pal/spark-1.5.1-bin-hadoop2.4/python/pyspark/sql/dataframe.pyc in
show(self, n, truncate)
254 +---+-----+
255 """
--> 256 print(self._jdf.showString(n, truncate))
257
258 def __repr__(self):
/home/thj1pal/spark-1.5.1-bin-hadoop2.4/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py
in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/home/thj1pal/spark-1.5.1-bin-hadoop2.4/python/pyspark/sql/utils.pyc in
deco(*a, **kw)
34 def deco(*a, **kw):
35 try:
---> 36 return f(*a, **kw)
37 except py4j.protocol.Py4JJavaError as e:
38 s = e.java_exception.toString()
/home/thj1pal/spark-1.5.1-bin-hadoop2.4/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py
in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling o33.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
in stage 4.0 failed 4 times, most recent failure: Lost task 0.3 in stage
4.0 (TID 9, pal-bd-n06-ib): java.lang.UnsupportedOperationException: Cannot
evaluate expression: count(1)
at
org.apache.spark.sql.catalyst.expressions.Unevaluable$class.eval(Expression.scala:188)
at
org.apache.spark.sql.catalyst.expressions.Count.eval(aggregates.scala:156)
at
org.apache.spark.sql.catalyst.expressions.BinaryExpression.eval(Expression.scala:327)
….
Re: error in sparkSQL 1.5 using count(1) in nested queries
Posted by Michael Armbrust <mi...@databricks.com>.
Thanks for reporting: https://issues.apache.org/jira/browse/SPARK-11032
You can probably workaround this by aliasing the count and just doing a
filter on that value afterwards.
On Thu, Oct 8, 2015 at 8:47 PM, Jeff Thompson <
jeffreykeatingthompson@gmail.com> wrote:
> After upgrading from 1.4.1 to 1.5.1 I found some of my spark SQL queries
> no longer worked. The problem seems to be related to using count(1) or count(*) in a
> nested query. I can reproduce the issue in a pyspark shell with the sample
> code below. The 'people' table is from spark-1.5.1-bin-hadoop2.4/
> examples/src/main/resources/people.json.
>
> Environment details: Hadoop 2.5.0-cdh5.3.0, YARN
>
> *Test code:*
>
> from pyspark.sql import SQLContext
> print(sc.version)
> sqlContext = SQLContext(sc)
>
> df = sqlContext.read.json("/user/thj1pal/people.json")
> df.show()
>
> sqlContext.registerDataFrameAsTable(df,"PEOPLE")
>
> result = sqlContext.sql("SELECT MIN(t0.age) FROM (SELECT * FROM PEOPLE
> WHERE age > 0) t0 HAVING(COUNT(1) > 0)")
> result.show()
>
> *spark 1.4.1 output*
>
> 1.4.1
> +----+-------+
> | age| name|
> +----+-------+
> |null|Michael|
> | 30| Andy|
> | 19| Justin|
> +----+-------+
>
> +--+
> |c0|
> +--+
> |19|
> +--+
>
>
> *spark 1.5.1 output*
>
> 1.5.1
> +----+-------+
> | age| name|
> +----+-------+
> |null|Michael|
> | 30| Andy|
> | 19| Justin|
> +----+-------+
>
> ---------------------------------------------------------------------------
> Py4JJavaError Traceback (most recent call last)
> <ipython-input-1-342b585498f7> in <module>()
> 9
> 10 result = sqlContext.sql("SELECT MIN(t0.age) FROM (SELECT *
> FROM PEOPLE WHERE age > 0) t0 HAVING(COUNT(1) > 0)")
> ---> 11 result.show()
>
> /home/thj1pal/spark-1.5.1-bin-hadoop2.4/python/pyspark/sql/dataframe.pyc
> in show(self, n, truncate)
> 254 +---+-----+
> 255 """
> --> 256 print(self._jdf.showString(n, truncate))
> 257
> 258 def __repr__(self):
>
> /home/thj1pal/spark-1.5.1-bin-hadoop2.4/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py
> in __call__(self, *args)
> 536 answer = self.gateway_client.send_command(command)
> 537 return_value = get_return_value(answer,
> self.gateway_client,
> --> 538 self.target_id, self.name)
> 539
> 540 for temp_arg in temp_args:
>
> /home/thj1pal/spark-1.5.1-bin-hadoop2.4/python/pyspark/sql/utils.pyc in
> deco(*a, **kw)
> 34 def deco(*a, **kw):
> 35 try:
> ---> 36 return f(*a, **kw)
> 37 except py4j.protocol.Py4JJavaError as e:
> 38 s = e.java_exception.toString()
>
> /home/thj1pal/spark-1.5.1-bin-hadoop2.4/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py
> in get_return_value(answer, gateway_client, target_id, name)
> 298 raise Py4JJavaError(
> 299 'An error occurred while calling {0}{1}{2}.\n'.
> --> 300 format(target_id, '.', name), value)
> 301 else:
> 302 raise Py4JError(
>
> Py4JJavaError: An error occurred while calling o33.showString.
> : org.apache.spark.SparkException: Job aborted due to stage failure: Task
> 0 in stage 4.0 failed 4 times, most recent failure: Lost task 0.3 in stage
> 4.0 (TID 9, pal-bd-n06-ib): java.lang.UnsupportedOperationException: Cannot
> evaluate expression: count(1)
> at
> org.apache.spark.sql.catalyst.expressions.Unevaluable$class.eval(Expression.scala:188)
> at
> org.apache.spark.sql.catalyst.expressions.Count.eval(aggregates.scala:156)
> at
> org.apache.spark.sql.catalyst.expressions.BinaryExpression.eval(Expression.scala:327)
> ….
>