Posted to user@spark.apache.org by Sadhan Sood <sa...@gmail.com> on 2014/11/17 18:58:11 UTC

Exception in spark sql when running a group by query

While testing Spark SQL, we ran the following GROUP BY query with an expression
in the grouping clause and got an exception. The same query works fine in Hive.

SELECT from_unixtime(floor(xyz.whenrequestreceived/1000.0 - 25200),
      'yyyy/MM/dd') as pst_date,
    count(*) as num_xyzs
  FROM
    all_matched_abc
  GROUP BY
    from_unixtime(floor(xyz.whenrequestreceived/1000.0 - 25200),
      'yyyy/MM/dd')

14/11/17 17:41:46 ERROR thriftserver.SparkSQLDriver: Failed in [SELECT
from_unixtime(floor(xyz.whenrequestreceived/1000.0 - 25200),
      'yyyy/MM/dd') as pst_date,
    count(*) as num_xyzs
  FROM
    all_matched_abc
  GROUP BY
    from_unixtime(floor(xyz.whenrequestreceived/1000.0 - 25200),
      'yyyy/MM/dd')
]
org.apache.spark.sql.catalyst.errors.package$TreeNodeException:
Expression not in GROUP BY:
HiveSimpleUdf#org.apache.hadoop.hive.ql.udf.UDFFromUnixTime(HiveGenericUdf#org.apache.hadoop.hive.ql.udf.generic.GenericUDFFloor(((CAST(xyz#183.whenrequestreceived
AS whenrequestreceived#187L, DoubleType) / 1000.0) - CAST(25200,
DoubleType))),yyyy/MM/dd) AS pst_date#179, tree:

Aggregate [HiveSimpleUdf#org.apache.hadoop.hive.ql.udf.UDFFromUnixTime(HiveGenericUdf#org.apache.hadoop.hive.ql.udf.generic.GenericUDFFloor(((CAST(xyz#183.whenrequestreceived,
DoubleType) / 1000.0) - CAST(25200, DoubleType))),yyyy/MM/dd)],
[HiveSimpleUdf#org.apache.hadoop.hive.ql.udf.UDFFromUnixTime(HiveGenericUdf#org.apache.hadoop.hive.ql.udf.generic.GenericUDFFloor(((CAST(xyz#183.whenrequestreceived
AS whenrequestreceived#187L, DoubleType) / 1000.0) - CAST(25200,
DoubleType))),yyyy/MM/dd) AS pst_date#179,COUNT(1) AS num_xyzs#180L]

 MetastoreRelation default, all_matched_abc, None
        at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3$$anonfun$applyOrElse$6.apply(Analyzer.scala:127)
        at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3$$anonfun$applyOrElse$6.apply(Analyzer.scala:125)
        at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3.applyOrElse(Analyzer.scala:125)
        at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3.applyOrElse(Analyzer.scala:115)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:144)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:135)
        at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$.apply(Analyzer.scala:115)
        at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$.apply(Analyzer.scala:113)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1$$anonfun$apply$2.apply(RuleExecutor.scala:61)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1$$anonfun$apply$2.apply(RuleExecutor.scala:59)
        at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:51)
        at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:60)
        at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:34)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1.apply(RuleExecutor.scala:59)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1.apply(RuleExecutor.scala:51)
        at scala.collection.immutable.List.foreach(List.scala:318)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor.apply(RuleExecutor.scala:51)
        at org.apache.spark.sql.SQLContext$QueryExecution.analyzed$lzycompute(SQLContext.scala:411)
        at org.apache.spark.sql.SQLContext$QueryExecution.analyzed(SQLContext.scala:411)
        at org.apache.spark.sql.SQLContext$QueryExecution.withCachedData$lzycompute(SQLContext.scala:412)
        at org.apache.spark.sql.SQLContext$QueryExecution.withCachedData(SQLContext.scala:412)
        at org.apache.spark.sql.SQLContext$QueryExecution.optimizedPlan$lzycompute(SQLContext.scala:413)
        at org.apache.spark.sql.SQLContext$QueryExecution.optimizedPlan(SQLContext.scala:413)
        at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan$lzycompute(SQLContext.scala:418)
        at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan(SQLContext.scala:416)
        at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan$lzycompute(SQLContext.scala:422)
        at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan(SQLContext.scala:422)
        at org.apache.spark.sql.hive.HiveContext$QueryExecution.stringResult(HiveContext.scala:425)
        at org.apache.spark.sql.hive.thriftserver.AbstractSparkSQLDriver.run(AbstractSparkSQLDriver.scala:59)
        at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.processCmd(SparkSQLCLIDriver.scala:276)
        at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:423)
        at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver$.main(SparkSQLCLIDriver.scala:211)
        at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.main(SparkSQLCLIDriver.scala)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:483)
        at org.apache.spark.deploy.SparkSubmit$.launch(SparkSubmit.scala:353)
        at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:75)
        at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
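
One workaround that may sidestep this analyzer check (a sketch only, using the same all_matched_abc table and xyz struct column as above, and not verified against this Spark build) is to compute the expression once in a subquery so the outer GROUP BY references a plain column alias instead of repeating the Hive UDF call:

-- hypothetical rewrite, assuming the schema implied by the query above
SELECT pst_date,
    count(*) as num_xyzs
  FROM (
    SELECT from_unixtime(floor(xyz.whenrequestreceived/1000.0 - 25200),
        'yyyy/MM/dd') as pst_date
    FROM all_matched_abc
  ) t
  GROUP BY pst_date

Because the outer query groups by a simple attribute, the "Expression not in GROUP BY" check no longer has to prove that two separately analyzed UDF expressions are equal.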

Re: Exception in spark sql when running a group by query

Posted by Sadhan Sood <sa...@gmail.com>.
Ah, makes sense - thanks, Michael!

On Mon, Nov 17, 2014 at 6:08 PM, Michael Armbrust <mi...@databricks.com>
wrote:

> You are perhaps hitting an issue that was fixed by #3248
> <https://github.com/apache/spark/pull/3248>?

Re: Exception in spark sql when running a group by query

Posted by Michael Armbrust <mi...@databricks.com>.
You are perhaps hitting an issue that was fixed by #3248
<https://github.com/apache/spark/pull/3248>?
