Posted to user@spark.apache.org by gtanguy <g....@gmail.com> on 2015/03/26 17:24:30 UTC

DataFrame GroupBy

Hello everybody,

I am trying to do a simple groupBy:

*Code:*
val df = hiveContext.sql("SELECT * FROM table1")
df.printSchema()
df.groupBy("customer_id").count().show(5)
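
For comparison, here is the same aggregation expressed directly in SQL (just a sketch on my side, using the table and column from above; I have not confirmed its behaviour), to check whether the failure is specific to the DataFrame groupBy or also occurs when the count goes through the SQL parser:

// Same count pushed into the SQL string, as a cross-check.
// If this works while df.groupBy(...) fails, the problem is likely in the
// DataFrame plan (attribute resolution), not in the table or its data.
val counts = hiveContext.sql(
  "SELECT customer_id, COUNT(*) AS cnt FROM table1 GROUP BY customer_id")
counts.show(5)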

*Schema and stacktrace:*
root
 |-- customer_id: string (nullable = true)
 |-- rank: string (nullable = true)
 |-- reco_material_id: string (nullable = true)
 |-- score: string (nullable = true)
 |-- category: string (nullable = true)
 |-- is_achat: string (nullable = true)

15/03/26 17:19:29 INFO HiveMetaStore: 0: get_table : db=default tbl=table1
15/03/26 17:19:29 INFO audit: ugi=spark ip=unknown-ip-addr cmd=get_table : db=default tbl=table1
Exception in thread "main" java.util.NoSuchElementException: key not found: customer_id#0
	at scala.collection.MapLike$class.default(MapLike.scala:228)
	at org.apache.spark.sql.catalyst.expressions.AttributeMap.default(AttributeMap.scala:29)
	at scala.collection.MapLike$class.apply(MapLike.scala:141)
	at org.apache.spark.sql.catalyst.expressions.AttributeMap.apply(AttributeMap.scala:29)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:34)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
	at scala.collection.AbstractTraversable.map(Traversable.scala:105)
	at org.apache.spark.sql.hive.execution.HiveTableScan.<init>(HiveTableScan.scala:53)
	at org.apache.spark.sql.hive.HiveStrategies$HiveTableScans$$anonfun$15.apply(HiveStrategies.scala:216)
	at org.apache.spark.sql.hive.HiveStrategies$HiveTableScans$$anonfun$15.apply(HiveStrategies.scala:216)
	at org.apache.spark.sql.SQLContext$SparkPlanner.pruneFilterProject(SQLContext.scala:1034)
	at org.apache.spark.sql.hive.HiveStrategies$HiveTableScans$.apply(HiveStrategies.scala:212)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
	at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.apply(QueryPlanner.scala:59)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.planLater(QueryPlanner.scala:54)
	at org.apache.spark.sql.execution.SparkStrategies$HashAggregation$.apply(SparkStrategies.scala:152)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
	at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.apply(QueryPlanner.scala:59)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.planLater(QueryPlanner.scala:54)
	at org.apache.spark.sql.execution.SparkStrategies$BasicOperators$.apply(SparkStrategies.scala:290)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
	at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.apply(QueryPlanner.scala:59)
	at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan$lzycompute(SQLContext.scala:1081)
	at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan(SQLContext.scala:1079)
	at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan$lzycompute(SQLContext.scala:1085)
	at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan(SQLContext.scala:1085)
	at org.apache.spark.sql.DataFrame.collect(DataFrame.scala:815)
	at org.apache.spark.sql.DataFrame.head(DataFrame.scala:758)
	at org.apache.spark.sql.DataFrame.take(DataFrame.scala:809)
	at org.apache.spark.sql.DataFrame.showString(DataFrame.scala:178)
	at org.apache.spark.sql.DataFrame.show(DataFrame.scala:314)
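
Reading the trace, the failure happens while planning the HiveTableScan: the scan's AttributeMap does not contain the requested attribute customer_id#0. As a probe (my own assumption, not something I have verified), re-selecting the column explicitly before grouping should force a fresh resolution of the attribute:

// Probe: project the column first, then group, so the aggregation runs over
// a freshly resolved attribute rather than the one carried by the scan.
// (Sketch only; whether this sidesteps the planner issue is an assumption.)
val byCustomer = df.select("customer_id").groupBy("customer_id").count()
byCustomer.show(5)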


Does anyone have an idea?

Regards,

Germain Tanguy.



--
View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/DataFrame-GroupBy-tp22242.html
Sent from the Apache Spark User List mailing list archive at Nabble.com.

---------------------------------------------------------------------
To unsubscribe, e-mail: user-unsubscribe@spark.apache.org
For additional commands, e-mail: user-help@spark.apache.org