Posted to issues@spark.apache.org by "Chaozhong Yang (JIRA)" <ji...@apache.org> on 2015/01/06 09:33:34 UTC

[jira] [Issue Comment Deleted] (SPARK-4850) "GROUP BY" can't work if the schema of SchemaRDD contains struct or array type

     [ https://issues.apache.org/jira/browse/SPARK-4850?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Chaozhong Yang updated SPARK-4850:
----------------------------------
    Comment: was deleted

(was: https://issues.apache.org/jira/secure/ViewProfile.jspa?name=lian+cheng )

> "GROUP BY" can't work if the schema of SchemaRDD contains struct or array type
> ------------------------------------------------------------------------------
>
>                 Key: SPARK-4850
>                 URL: https://issues.apache.org/jira/browse/SPARK-4850
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 1.1.0, 1.1.1, 1.2.0, 1.1.2
>            Reporter: Chaozhong Yang
>            Assignee: Cheng Lian
>              Labels: group, sql
>   Original Estimate: 96h
>  Remaining Estimate: 96h
>
> Code in Spark Shell as follows:
> {code}
> val sqlContext = new org.apache.spark.sql.SQLContext(sc)
> val path = "path/to/json"
> sqlContext.jsonFile(path).registerTempTable("Table")
> val t = sqlContext.sql("select * from Table group by a")
> t.collect
> {code}
> Let's look at the schema of `Table`:
> {code}
> root
>  |-- a: integer (nullable = true)
>  |-- arr: array (nullable = true)
>  |    |-- element: integer (containsNull = false)
>  |-- createdAt: string (nullable = true)
>  |-- f: struct (nullable = true)
>  |    |-- __type: string (nullable = true)
>  |    |-- className: string (nullable = true)
>  |    |-- objectId: string (nullable = true)
>  |-- objectId: string (nullable = true)
>  |-- s: string (nullable = true)
>  |-- updatedAt: string (nullable = true)
> {code}
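> 
> For reference, a single JSON record of the following shape (a hypothetical sample, not taken from the original data set) would yield a schema like the one above, so the problem can be reproduced without an external file:
> {code}
> val sample = """{"a": 1, "arr": [1, 2], "createdAt": "2014-12-01", "f": {"__type": "Pointer", "className": "MyClass", "objectId": "o1"}, "objectId": "o2", "s": "hello", "updatedAt": "2014-12-02"}"""
> // jsonRDD infers the schema (integer, array, struct, strings) from the record itself
> val rdd = sqlContext.jsonRDD(sc.parallelize(Seq(sample)))
> rdd.registerTempTable("Table")
> sqlContext.sql("select * from Table group by a").collect()
> {code}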
> The following exception will be thrown:
> {code}
> org.apache.spark.sql.catalyst.errors.package$TreeNodeException: Expression not in GROUP BY: arr#9, tree:
> Aggregate [a#8], [a#8,arr#9,createdAt#10,f#11,objectId#12,s#13,updatedAt#14]
>  Subquery TestImport
>   LogicalRDD [a#8,arr#9,createdAt#10,f#11,objectId#12,s#13,updatedAt#14], MappedRDD[18] at map at JsonRDD.scala:47
> 	at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3$$anonfun$applyOrElse$7.apply(Analyzer.scala:126)
> 	at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3$$anonfun$applyOrElse$7.apply(Analyzer.scala:125)
> 	at scala.Option.foreach(Option.scala:236)
> 	at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3.applyOrElse(Analyzer.scala:125)
> 	at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3.applyOrElse(Analyzer.scala:108)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:144)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:135)
> 	at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$.apply(Analyzer.scala:108)
> 	at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$.apply(Analyzer.scala:106)
> 	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1$$anonfun$apply$2.apply(RuleExecutor.scala:61)
> 	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1$$anonfun$apply$2.apply(RuleExecutor.scala:59)
> 	at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:51)
> 	at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:60)
> 	at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:34)
> 	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1.apply(RuleExecutor.scala:59)
> 	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1.apply(RuleExecutor.scala:51)
> 	at scala.collection.immutable.List.foreach(List.scala:318)
> 	at org.apache.spark.sql.catalyst.rules.RuleExecutor.apply(RuleExecutor.scala:51)
> 	at org.apache.spark.sql.SQLContext$QueryExecution.analyzed$lzycompute(SQLContext.scala:411)
> 	at org.apache.spark.sql.SQLContext$QueryExecution.analyzed(SQLContext.scala:411)
> 	at org.apache.spark.sql.SQLContext$QueryExecution.withCachedData$lzycompute(SQLContext.scala:412)
> 	at org.apache.spark.sql.SQLContext$QueryExecution.withCachedData(SQLContext.scala:412)
> 	at org.apache.spark.sql.SQLContext$QueryExecution.optimizedPlan$lzycompute(SQLContext.scala:413)
> 	at org.apache.spark.sql.SQLContext$QueryExecution.optimizedPlan(SQLContext.scala:413)
> 	at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan$lzycompute(SQLContext.scala:418)
> 	at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan(SQLContext.scala:416)
> 	at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan$lzycompute(SQLContext.scala:422)
> 	at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan(SQLContext.scala:422)
> 	at org.apache.spark.sql.SchemaRDD.collect(SchemaRDD.scala:444)
> 	at $iwC$$iwC$$iwC$$iwC.<init>(<console>:17)
> 	at $iwC$$iwC$$iwC.<init>(<console>:22)
> 	at $iwC$$iwC.<init>(<console>:24)
> 	at $iwC.<init>(<console>:26)
> 	at <init>(<console>:28)
> 	at .<init>(<console>:32)
> 	at .<clinit>(<console>)
> 	at .<init>(<console>:7)
> 	at .<clinit>(<console>)
> 	at $print(<console>)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
> 	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
> 	at java.lang.reflect.Method.invoke(Method.java:597)
> 	at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)
> 	at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)
> 	at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)
> 	at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)
> 	at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)
> 	at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:828)
> 	at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:873)
> 	at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:785)
> 	at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:628)
> 	at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:636)
> 	at org.apache.spark.repl.SparkILoop.loop(SparkILoop.scala:641)
> 	at org.apache.spark.repl.SparkILoop$$anonfun$process$1.apply$mcZ$sp(SparkILoop.scala:968)
> 	at org.apache.spark.repl.SparkILoop$$anonfun$process$1.apply(SparkILoop.scala:916)
> 	at org.apache.spark.repl.SparkILoop$$anonfun$process$1.apply(SparkILoop.scala:916)
> 	at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
> 	at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:916)
> 	at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1011)
> 	at org.apache.spark.repl.Main$.main(Main.scala:31)
> 	at org.apache.spark.repl.Main.main(Main.scala)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
> 	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
> 	at java.lang.reflect.Method.invoke(Method.java:597)
> 	at org.apache.spark.deploy.SparkSubmit$.launch(SparkSubmit.scala:358)
> 	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:75)
> 	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
> {code}
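> 
> As a workaround (a sketch only, assuming the intent is one result row per distinct value of `a`), the query is accepted once every selected column is either part of the GROUP BY clause or wrapped in an aggregate function:
> {code}
> sqlContext.sql("select a, count(*) from Table group by a").collect()
> {code}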


