You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@spark.apache.org by Jerry <je...@gmail.com> on 2015/08/14 18:50:59 UTC
Another issue with using lag and lead with data frames
So it seems like dataframes aren't going give me a break and just work. Now
it evaluates but goes nuts if it runs into a null case OR doesn't know how
to get the correct data type when I specify the default value as a string
expression. Let me know if anyone has a work around to this. PLEASE HELP
ME!!! THIS IS DRIVING ME NUTS! Below is what I used:
JSON:
{"A":"a"},
{"A":"c"},
{"A":"B"},
{"A":"d"},
{"A":"A"},
{"A":null}
Reading json:
df = sqlContext.jsonFile("/home/........./Desktop/trash.json")
CASE 1 (no default):
*$ dfb = df.selectExpr("lag(A,1)")$ dfb.show()*
Java.lang.NullPointerException
at
org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at
scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
15/08/14 09:17:29 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID 19,
localhost): java.lang.NullPointerException
at
org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at
scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
15/08/14 09:17:29 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1
times; aborting job
15/08/14 09:17:29 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose tasks
have all completed, from pool
15/08/14 09:17:29 INFO TaskSchedulerImpl: Cancelling stage 15
15/08/14 09:17:29 INFO DAGScheduler: ResultStage 15 (show at <console>:24)
failed in 0.069 s
15/08/14 09:17:29 INFO DAGScheduler: Job 15 failed: show at <console>:24,
took 0.112457 s
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage
15.0 (TID 19, localhost): java.lang.NullPointerException
at
org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at
scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org
$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
CASE 2 (with default):
*$ dfb = df.selectExpr("lag(A,1,'x')")$ dfb.show()*
java.lang.ClassCastException: java.lang.String cannot be cast to
org.apache.hadoop.io.Text
at
org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
at
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
at
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
at
org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at
scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
15/08/14 09:16:09 WARN TaskSetManager: Lost task 0.0 in stage 14.0 (TID 18,
localhost): java.lang.ClassCastException: java.lang.String cannot be cast
to org.apache.hadoop.io.Text
at
org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
at
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
at
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
at
org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at
scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
15/08/14 09:16:09 ERROR TaskSetManager: Task 0 in stage 14.0 failed 1
times; aborting job
15/08/14 09:16:09 INFO TaskSchedulerImpl: Removed TaskSet 14.0, whose tasks
have all completed, from pool
15/08/14 09:16:09 INFO TaskSchedulerImpl: Cancelling stage 14
15/08/14 09:16:09 INFO DAGScheduler: ResultStage 14 (show at <console>:24)
failed in 0.082 s
15/08/14 09:16:09 INFO DAGScheduler: Job 14 failed: show at <console>:24,
took 0.137699 s
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_25_piece0 on
localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_27_piece0 on
localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_26_piece0 on
localhost:33504 in memory (size: 19.3 KB, free: 264.9 MB)
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
in stage 14.0 failed 1 times, most recent failure: Lost task 0.0 in stage
14.0 (TID 18, localhost): java.lang.ClassCastException: java.lang.String
cannot be cast to org.apache.hadoop.io.Text
at
org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
at
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
at
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
at
org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at
scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org
$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
scala> dfb = df.selectExpr("lag(A,1)")
dfb: org.apache.spark.sql.DataFrame = ['lag(A,1): string]
scala> dfb.show()
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(243712) called with
curMem=645979, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_28 stored as values in
memory (estimated size 238.0 KB, free 264.1 MB)
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(19750) called with
curMem=889691, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_28_piece0 stored as
bytes in memory (estimated size 19.3 KB, free 264.1 MB)
15/08/14 09:17:29 INFO BlockManagerInfo: Added broadcast_28_piece0 in
memory on localhost:33504 (size: 19.3 KB, free: 264.9 MB)
15/08/14 09:17:29 INFO SparkContext: Created broadcast 28 from show at
<console>:24
15/08/14 09:17:29 INFO FileInputFormat: Total input paths to process : 1
15/08/14 09:17:29 INFO SparkContext: Starting job: show at <console>:24
15/08/14 09:17:29 INFO DAGScheduler: Got job 15 (show at <console>:24) with
1 output partitions (allowLocal=false)
15/08/14 09:17:29 INFO DAGScheduler: Final stage: ResultStage 15(show at
<console>:24)
15/08/14 09:17:29 INFO DAGScheduler: Parents of final stage: List()
15/08/14 09:17:29 INFO DAGScheduler: Missing parents: List()
15/08/14 09:17:29 INFO DAGScheduler: Submitting ResultStage 15
(MapPartitionsRDD[61] at show at <console>:24), which has no missing parents
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(5936) called with
curMem=909441, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29 stored as values in
memory (estimated size 5.8 KB, free 264.1 MB)
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(3259) called with
curMem=915377, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29_piece0 stored as
bytes in memory (estimated size 3.2 KB, free 264.1 MB)
15/08/14 09:17:29 INFO BlockManagerInfo: Added broadcast_29_piece0 in
memory on localhost:33504 (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:17:29 INFO SparkContext: Created broadcast 29 from broadcast at
DAGScheduler.scala:874
15/08/14 09:17:29 INFO DAGScheduler: Submitting 1 missing tasks from
ResultStage 15 (MapPartitionsRDD[61] at show at <console>:24)
15/08/14 09:17:29 INFO TaskSchedulerImpl: Adding task set 15.0 with 1 tasks
15/08/14 09:17:29 INFO TaskSetManager: Starting task 0.0 in stage 15.0 (TID
19, localhost, PROCESS_LOCAL, 1409 bytes)
15/08/14 09:17:29 INFO Executor: Running task 0.0 in stage 15.0 (TID 19)
15/08/14 09:17:29 INFO HadoopRDD: Input split:
file:/home/adminz/Desktop/trash.json:0+33
15/08/14 09:17:29 ERROR Executor: Exception in task 0.0 in stage 15.0 (TID
19)
java.lang.NullPointerException
at
org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at
scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
15/08/14 09:17:29 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID 19,
localhost): java.lang.NullPointerException
at
org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at
scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
15/08/14 09:17:29 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1
times; aborting job
15/08/14 09:17:29 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose tasks
have all completed, from pool
15/08/14 09:17:29 INFO TaskSchedulerImpl: Cancelling stage 15
15/08/14 09:17:29 INFO DAGScheduler: ResultStage 15 (show at <console>:24)
failed in 0.069 s
15/08/14 09:17:29 INFO DAGScheduler: Job 15 failed: show at <console>:24,
took 0.112457 s
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage
15.0 (TID 19, localhost): java.lang.NullPointerException
at
org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at
scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at
scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org
$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
Re: Another issue with using lag and lead with data frames
Posted by Jerry <je...@gmail.com>.
Still not cooperating...
lag(A,1,'X') OVER (ORDER BY A) as LA
^
at scala.sys.package$.error(package.scala:27)
at
org.apache.spark.sql.catalyst.SqlParser.parseExpression(SqlParser.scala:45)
at
org.apache.spark.sql.DataFrame$$anonfun$selectExpr$1.apply(DataFrame.scala:626)
at
org.apache.spark.sql.DataFrame$$anonfun$selectExpr$1.apply(DataFrame.scala:625)
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at
scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:34)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
at scala.collection.AbstractTraversable.map(Traversable.scala:105)
at org.apache.spark.sql.DataFrame.selectExpr(DataFrame.scala:625)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:21)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:26)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:28)
at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:30)
at $iwC$$iwC$$iwC$$iwC.<init>(<console>:32)
at $iwC$$iwC$$iwC.<init>(<console>:34)
at $iwC$$iwC.<init>(<console>:36)
at $iwC.<init>(<console>:38)
at <init>(<console>:40)
at .<init>(<console>:44)
at .<clinit>(<console>)
at .<init>(<console>:7)
at .<clinit>(<console>)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at
org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
at
org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1338)
at
org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
at
org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
at
org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
at org.apache.spark.repl.SparkILoop.org
$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
at
org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
at
org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at
org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at
scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
at org.apache.spark.repl.SparkILoop.org
$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059)
at org.apache.spark.repl.Main$.main(Main.scala:31)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at
org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:665)
at
org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:170)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:193)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:112)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
On Fri, Aug 14, 2015 at 1:39 PM, Jerry <je...@gmail.com> wrote:
> Hi Salih,
> Normally I do sort before performing that operation, but since I've been
> trying to get this working for a week, I'm just loading something simple to
> test if lag works. Earlier I was having DB issues.... so it's been a long
> run of solving one runtime exception after another. Hopefully those links
> point me to something useful. Let me know if you can run the above code/
> what you did different to get that code to run.
>
> Thanks,
> Jerry
>
> On Fri, Aug 14, 2015 at 1:23 PM, Salih Oztop <so...@yahoo.com> wrote:
>
>> Hi Jerry,
>> This blog post is perfect for window functions in Spark.
>>
>> https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html
>> and a generic sql usage from oracle-base blog.
>> https://oracle-base.com/articles/misc/lag-lead-analytic-functions
>>
>> It seems you are not using Window part for Order By clause.
>>
>> Kind Regards
>> Salih Oztop
>>
>> ------------------------------
>> *From:* Jerry <je...@gmail.com>
>> *To:* user <us...@spark.apache.org>
>> *Sent:* Friday, August 14, 2015 5:50 PM
>> *Subject:* Another issue with using lag and lead with data frames
>>
>> So it seems like dataframes aren't going give me a break and just work.
>> Now it evaluates but goes nuts if it runs into a null case OR doesn't know
>> how to get the correct data type when I specify the default value as a
>> string expression. Let me know if anyone has a work around to this. PLEASE
>> HELP ME!!! THIS IS DRIVING ME NUTS! Below is what I used:
>>
>> JSON:
>> {"A":"a"},
>> {"A":"c"},
>> {"A":"B"},
>> {"A":"d"},
>> {"A":"A"},
>> {"A":null}
>> Reading json:
>> df = sqlContext.jsonFile("/home/........./Desktop/trash.json")
>>
>>
>> CASE 1 (no default):
>>
>> *$ dfb = df.selectExpr("lag(A,1)")$ dfb.show()*
>> Java.lang.NullPointerException
>> at
>> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
>> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
>> at
>> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
>> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>> at
>> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
>> at scala.collection.TraversableOnce$class.to
>> (TraversableOnce.scala:273)
>> at scala.collection.AbstractIterator.to
>> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
>> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
>> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
>> at org.apache.spark.scheduler.Task.run(Task.scala:70)
>> at
>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
>> at java.lang.Thread.run(Thread.java:662)
>> 15/08/14 09:17:29 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID
>> 19, localhost): java.lang.NullPointerException
>> at
>> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
>> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
>> at
>> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
>> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>> at
>> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
>> at scala.collection.TraversableOnce$class.to
>> (TraversableOnce.scala:273)
>> at scala.collection.AbstractIterator.to
>> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
>> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
>> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
>> at org.apache.spark.scheduler.Task.run(Task.scala:70)
>> at
>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
>> at java.lang.Thread.run(Thread.java:662)
>>
>> 15/08/14 09:17:29 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1
>> times; aborting job
>> 15/08/14 09:17:29 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose
>> tasks have all completed, from pool
>> 15/08/14 09:17:29 INFO TaskSchedulerImpl: Cancelling stage 15
>> 15/08/14 09:17:29 INFO DAGScheduler: ResultStage 15 (show at
>> <console>:24) failed in 0.069 s
>> 15/08/14 09:17:29 INFO DAGScheduler: Job 15 failed: show at <console>:24,
>> took 0.112457 s
>> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
>> in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage
>> 15.0 (TID 19, localhost): java.lang.NullPointerException
>> at
>> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
>> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
>> at
>> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
>> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>> at
>> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
>> at scala.collection.TraversableOnce$class.to
>> (TraversableOnce.scala:273)
>> at scala.collection.AbstractIterator.to
>> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
>> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
>> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
>> at org.apache.spark.scheduler.Task.run(Task.scala:70)
>> at
>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
>> at java.lang.Thread.run(Thread.java:662)
>>
>> Driver stacktrace:
>> at org.apache.spark.scheduler.DAGScheduler.org
>> <http://org.apache.spark.scheduler.dagscheduler.org/>
>> $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
>> at
>> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>> at
>> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
>> at scala.Option.foreach(Option.scala:236)
>> at
>> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
>> at
>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
>> at
>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
>> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>>
>>
>> CASE 2 (with default):
>>
>> *$ dfb = df.selectExpr("lag(A,1,'x')")$ dfb.show()*
>>
>> java.lang.ClassCastException: java.lang.String cannot be cast to
>> org.apache.hadoop.io.Text
>> at
>> org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
>> at
>> org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
>> at
>> org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
>> at
>> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
>> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
>> at
>> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
>> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>> at
>> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
>> at scala.collection.TraversableOnce$class.to
>> (TraversableOnce.scala:273)
>> at scala.collection.AbstractIterator.to
>> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
>> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
>> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
>> at org.apache.spark.scheduler.Task.run(Task.scala:70)
>> at
>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
>> at java.lang.Thread.run(Thread.java:662)
>> 15/08/14 09:16:09 WARN TaskSetManager: Lost task 0.0 in stage 14.0 (TID
>> 18, localhost): java.lang.ClassCastException: java.lang.String cannot be
>> cast to org.apache.hadoop.io.Text
>> at
>> org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
>> at
>> org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
>> at
>> org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
>> at
>> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
>> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
>> at
>> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
>> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>> at
>> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
>> at scala.collection.TraversableOnce$class.to
>> (TraversableOnce.scala:273)
>> at scala.collection.AbstractIterator.to
>> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
>> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
>> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
>> at org.apache.spark.scheduler.Task.run(Task.scala:70)
>> at
>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
>> at java.lang.Thread.run(Thread.java:662)
>>
>> 15/08/14 09:16:09 ERROR TaskSetManager: Task 0 in stage 14.0 failed 1
>> times; aborting job
>> 15/08/14 09:16:09 INFO TaskSchedulerImpl: Removed TaskSet 14.0, whose
>> tasks have all completed, from pool
>> 15/08/14 09:16:09 INFO TaskSchedulerImpl: Cancelling stage 14
>> 15/08/14 09:16:09 INFO DAGScheduler: ResultStage 14 (show at
>> <console>:24) failed in 0.082 s
>> 15/08/14 09:16:09 INFO DAGScheduler: Job 14 failed: show at <console>:24,
>> took 0.137699 s
>> 15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_25_piece0 on
>> localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
>> 15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_27_piece0 on
>> localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
>> 15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_26_piece0 on
>> localhost:33504 in memory (size: 19.3 KB, free: 264.9 MB)
>> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
>> in stage 14.0 failed 1 times, most recent failure: Lost task 0.0 in stage
>> 14.0 (TID 18, localhost): java.lang.ClassCastException: java.lang.String
>> cannot be cast to org.apache.hadoop.io.Text
>> at
>> org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
>> at
>> org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
>> at
>> org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
>> at
>> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
>> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
>> at
>> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
>> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>> at
>> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
>> at scala.collection.TraversableOnce$class.to
>> (TraversableOnce.scala:273)
>> at scala.collection.AbstractIterator.to
>> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
>> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
>> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
>> at org.apache.spark.scheduler.Task.run(Task.scala:70)
>> at
>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
>> at java.lang.Thread.run(Thread.java:662)
>>
>> Driver stacktrace:
>> at org.apache.spark.scheduler.DAGScheduler.org
>> <http://org.apache.spark.scheduler.dagscheduler.org/>
>> $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
>> at
>> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>> at
>> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
>> at scala.Option.foreach(Option.scala:236)
>> at
>> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
>> at
>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
>> at
>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
>> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>>
>>
>> scala> dfb = df.selectExpr("lag(A,1)")
>> dfb: org.apache.spark.sql.DataFrame = ['lag(A,1): string]
>>
>> scala> dfb.show()
>> 15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(243712) called with
>> curMem=645979, maxMem=277842493
>> 15/08/14 09:17:29 INFO MemoryStore: Block broadcast_28 stored as values
>> in memory (estimated size 238.0 KB, free 264.1 MB)
>> 15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(19750) called with
>> curMem=889691, maxMem=277842493
>> 15/08/14 09:17:29 INFO MemoryStore: Block broadcast_28_piece0 stored as
>> bytes in memory (estimated size 19.3 KB, free 264.1 MB)
>> 15/08/14 09:17:29 INFO BlockManagerInfo: Added broadcast_28_piece0 in
>> memory on localhost:33504 (size: 19.3 KB, free: 264.9 MB)
>> 15/08/14 09:17:29 INFO SparkContext: Created broadcast 28 from show at
>> <console>:24
>> 15/08/14 09:17:29 INFO FileInputFormat: Total input paths to process : 1
>> 15/08/14 09:17:29 INFO SparkContext: Starting job: show at <console>:24
>> 15/08/14 09:17:29 INFO DAGScheduler: Got job 15 (show at <console>:24)
>> with 1 output partitions (allowLocal=false)
>> 15/08/14 09:17:29 INFO DAGScheduler: Final stage: ResultStage 15(show at
>> <console>:24)
>> 15/08/14 09:17:29 INFO DAGScheduler: Parents of final stage: List()
>> 15/08/14 09:17:29 INFO DAGScheduler: Missing parents: List()
>> 15/08/14 09:17:29 INFO DAGScheduler: Submitting ResultStage 15
>> (MapPartitionsRDD[61] at show at <console>:24), which has no missing parents
>> 15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(5936) called with
>> curMem=909441, maxMem=277842493
>> 15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29 stored as values
>> in memory (estimated size 5.8 KB, free 264.1 MB)
>> 15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(3259) called with
>> curMem=915377, maxMem=277842493
>> 15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29_piece0 stored as
>> bytes in memory (estimated size 3.2 KB, free 264.1 MB)
>> 15/08/14 09:17:29 INFO BlockManagerInfo: Added broadcast_29_piece0 in
>> memory on localhost:33504 (size: 3.2 KB, free: 264.9 MB)
>> 15/08/14 09:17:29 INFO SparkContext: Created broadcast 29 from broadcast
>> at DAGScheduler.scala:874
>> 15/08/14 09:17:29 INFO DAGScheduler: Submitting 1 missing tasks from
>> ResultStage 15 (MapPartitionsRDD[61] at show at <console>:24)
>> 15/08/14 09:17:29 INFO TaskSchedulerImpl: Adding task set 15.0 with 1
>> tasks
>> 15/08/14 09:17:29 INFO TaskSetManager: Starting task 0.0 in stage 15.0
>> (TID 19, localhost, PROCESS_LOCAL, 1409 bytes)
>> 15/08/14 09:17:29 INFO Executor: Running task 0.0 in stage 15.0 (TID 19)
>> 15/08/14 09:17:29 INFO HadoopRDD: Input split:
>> file:/home/adminz/Desktop/trash.json:0+33
>> 15/08/14 09:17:29 ERROR Executor: Exception in task 0.0 in stage 15.0
>> (TID 19)
>> java.lang.NullPointerException
>> at
>> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
>> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
>> at
>> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
>> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>> at
>> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
>> at scala.collection.TraversableOnce$class.to
>> (TraversableOnce.scala:273)
>> at scala.collection.AbstractIterator.to
>> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
>> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
>> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
>> at org.apache.spark.scheduler.Task.run(Task.scala:70)
>> at
>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
>> at java.lang.Thread.run(Thread.java:662)
>> 15/08/14 09:17:29 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID
>> 19, localhost): java.lang.NullPointerException
>> at
>> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
>> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
>> at
>> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
>> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>> at
>> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
>> at scala.collection.TraversableOnce$class.to
>> (TraversableOnce.scala:273)
>> at scala.collection.AbstractIterator.to
>> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
>> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
>> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
>> at org.apache.spark.scheduler.Task.run(Task.scala:70)
>> at
>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
>> at java.lang.Thread.run(Thread.java:662)
>>
>> 15/08/14 09:17:29 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1
>> times; aborting job
>> 15/08/14 09:17:29 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose
>> tasks have all completed, from pool
>> 15/08/14 09:17:29 INFO TaskSchedulerImpl: Cancelling stage 15
>> 15/08/14 09:17:29 INFO DAGScheduler: ResultStage 15 (show at
>> <console>:24) failed in 0.069 s
>> 15/08/14 09:17:29 INFO DAGScheduler: Job 15 failed: show at <console>:24,
>> took 0.112457 s
>> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
>> in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage
>> 15.0 (TID 19, localhost): java.lang.NullPointerException
>> at
>> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
>> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
>> at
>> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
>> at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
>> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>> at
>> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
>> at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
>> at scala.collection.TraversableOnce$class.to
>> (TraversableOnce.scala:273)
>> at scala.collection.AbstractIterator.to
>> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
>> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
>> at
>> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
>> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
>> at org.apache.spark.scheduler.Task.run(Task.scala:70)
>> at
>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
>> at java.lang.Thread.run(Thread.java:662)
>>
>> Driver stacktrace:
>> at org.apache.spark.scheduler.DAGScheduler.org
>> <http://org.apache.spark.scheduler.dagscheduler.org/>
>> $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
>> at
>> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>> at
>> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
>> at scala.Option.foreach(Option.scala:236)
>> at
>> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
>> at
>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
>> at
>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
>> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>>
>>
>>
>>
>>
>>
>>
>
Re: Another issue with using lag and lead with data frames
Posted by Jerry <je...@gmail.com>.
Hi Salih,
Normally I do sort before performing that operation, but since I've been
trying to get this working for a week, I'm just loading something simple to
test if lag works. Earlier I was having DB issues.... so it's been a long
run of solving one runtime exception after another. Hopefully those links
point me to something useful. Let me know if you can run the above code/
what you did different to get that code to run.
Thanks,
Jerry
On Fri, Aug 14, 2015 at 1:23 PM, Salih Oztop <so...@yahoo.com> wrote:
> Hi Jerry,
> This blog post is perfect for window functions in Spark.
>
> https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html
> and a generic sql usage from oracle-base blog.
> https://oracle-base.com/articles/misc/lag-lead-analytic-functions
>
> It seems you are not using Window part for Order By clause.
>
> Kind Regards
> Salih Oztop
>
> ------------------------------
> *From:* Jerry <je...@gmail.com>
> *To:* user <us...@spark.apache.org>
> *Sent:* Friday, August 14, 2015 5:50 PM
> *Subject:* Another issue with using lag and lead with data frames
>
> So it seems like dataframes aren't going give me a break and just work.
> Now it evaluates but goes nuts if it runs into a null case OR doesn't know
> how to get the correct data type when I specify the default value as a
> string expression. Let me know if anyone has a work around to this. PLEASE
> HELP ME!!! THIS IS DRIVING ME NUTS! Below is what I used:
>
> JSON:
> {"A":"a"},
> {"A":"c"},
> {"A":"B"},
> {"A":"d"},
> {"A":"A"},
> {"A":null}
> Reading json:
> df = sqlContext.jsonFile("/home/........./Desktop/trash.json")
>
>
> CASE 1 (no default):
>
> *$ dfb = df.selectExpr("lag(A,1)")$ dfb.show()*
> Java.lang.NullPointerException
> at
> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
> at
> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at
> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to
> (TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to
> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
> at org.apache.spark.scheduler.Task.run(Task.scala:70)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
> at java.lang.Thread.run(Thread.java:662)
> 15/08/14 09:17:29 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID
> 19, localhost): java.lang.NullPointerException
> at
> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
> at
> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at
> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to
> (TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to
> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
> at org.apache.spark.scheduler.Task.run(Task.scala:70)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
> at java.lang.Thread.run(Thread.java:662)
>
> 15/08/14 09:17:29 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1
> times; aborting job
> 15/08/14 09:17:29 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose
> tasks have all completed, from pool
> 15/08/14 09:17:29 INFO TaskSchedulerImpl: Cancelling stage 15
> 15/08/14 09:17:29 INFO DAGScheduler: ResultStage 15 (show at <console>:24)
> failed in 0.069 s
> 15/08/14 09:17:29 INFO DAGScheduler: Job 15 failed: show at <console>:24,
> took 0.112457 s
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
> in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage
> 15.0 (TID 19, localhost): java.lang.NullPointerException
> at
> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
> at
> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at
> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to
> (TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to
> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
> at org.apache.spark.scheduler.Task.run(Task.scala:70)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
> at java.lang.Thread.run(Thread.java:662)
>
> Driver stacktrace:
> at org.apache.spark.scheduler.DAGScheduler.org
> <http://org.apache.spark.scheduler.dagscheduler.org/>
> $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
> at
> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> at
> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
> at scala.Option.foreach(Option.scala:236)
> at
> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>
>
> CASE 2 (with default):
>
> *$ dfb = df.selectExpr("lag(A,1,'x')")$ dfb.show()*
>
> java.lang.ClassCastException: java.lang.String cannot be cast to
> org.apache.hadoop.io.Text
> at
> org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
> at
> org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
> at
> org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
> at
> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
> at
> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at
> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to
> (TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to
> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
> at org.apache.spark.scheduler.Task.run(Task.scala:70)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
> at java.lang.Thread.run(Thread.java:662)
> 15/08/14 09:16:09 WARN TaskSetManager: Lost task 0.0 in stage 14.0 (TID
> 18, localhost): java.lang.ClassCastException: java.lang.String cannot be
> cast to org.apache.hadoop.io.Text
> at
> org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
> at
> org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
> at
> org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
> at
> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
> at
> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at
> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to
> (TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to
> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
> at org.apache.spark.scheduler.Task.run(Task.scala:70)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
> at java.lang.Thread.run(Thread.java:662)
>
> 15/08/14 09:16:09 ERROR TaskSetManager: Task 0 in stage 14.0 failed 1
> times; aborting job
> 15/08/14 09:16:09 INFO TaskSchedulerImpl: Removed TaskSet 14.0, whose
> tasks have all completed, from pool
> 15/08/14 09:16:09 INFO TaskSchedulerImpl: Cancelling stage 14
> 15/08/14 09:16:09 INFO DAGScheduler: ResultStage 14 (show at <console>:24)
> failed in 0.082 s
> 15/08/14 09:16:09 INFO DAGScheduler: Job 14 failed: show at <console>:24,
> took 0.137699 s
> 15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_25_piece0 on
> localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
> 15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_27_piece0 on
> localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
> 15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_26_piece0 on
> localhost:33504 in memory (size: 19.3 KB, free: 264.9 MB)
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
> in stage 14.0 failed 1 times, most recent failure: Lost task 0.0 in stage
> 14.0 (TID 18, localhost): java.lang.ClassCastException: java.lang.String
> cannot be cast to org.apache.hadoop.io.Text
> at
> org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
> at
> org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
> at
> org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
> at
> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
> at
> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at
> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to
> (TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to
> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
> at org.apache.spark.scheduler.Task.run(Task.scala:70)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
> at java.lang.Thread.run(Thread.java:662)
>
> Driver stacktrace:
> at org.apache.spark.scheduler.DAGScheduler.org
> <http://org.apache.spark.scheduler.dagscheduler.org/>
> $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
> at
> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> at
> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
> at scala.Option.foreach(Option.scala:236)
> at
> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>
>
> scala> dfb = df.selectExpr("lag(A,1)")
> dfb: org.apache.spark.sql.DataFrame = ['lag(A,1): string]
>
> scala> dfb.show()
> 15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(243712) called with
> curMem=645979, maxMem=277842493
> 15/08/14 09:17:29 INFO MemoryStore: Block broadcast_28 stored as values in
> memory (estimated size 238.0 KB, free 264.1 MB)
> 15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(19750) called with
> curMem=889691, maxMem=277842493
> 15/08/14 09:17:29 INFO MemoryStore: Block broadcast_28_piece0 stored as
> bytes in memory (estimated size 19.3 KB, free 264.1 MB)
> 15/08/14 09:17:29 INFO BlockManagerInfo: Added broadcast_28_piece0 in
> memory on localhost:33504 (size: 19.3 KB, free: 264.9 MB)
> 15/08/14 09:17:29 INFO SparkContext: Created broadcast 28 from show at
> <console>:24
> 15/08/14 09:17:29 INFO FileInputFormat: Total input paths to process : 1
> 15/08/14 09:17:29 INFO SparkContext: Starting job: show at <console>:24
> 15/08/14 09:17:29 INFO DAGScheduler: Got job 15 (show at <console>:24)
> with 1 output partitions (allowLocal=false)
> 15/08/14 09:17:29 INFO DAGScheduler: Final stage: ResultStage 15(show at
> <console>:24)
> 15/08/14 09:17:29 INFO DAGScheduler: Parents of final stage: List()
> 15/08/14 09:17:29 INFO DAGScheduler: Missing parents: List()
> 15/08/14 09:17:29 INFO DAGScheduler: Submitting ResultStage 15
> (MapPartitionsRDD[61] at show at <console>:24), which has no missing parents
> 15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(5936) called with
> curMem=909441, maxMem=277842493
> 15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29 stored as values in
> memory (estimated size 5.8 KB, free 264.1 MB)
> 15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(3259) called with
> curMem=915377, maxMem=277842493
> 15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29_piece0 stored as
> bytes in memory (estimated size 3.2 KB, free 264.1 MB)
> 15/08/14 09:17:29 INFO BlockManagerInfo: Added broadcast_29_piece0 in
> memory on localhost:33504 (size: 3.2 KB, free: 264.9 MB)
> 15/08/14 09:17:29 INFO SparkContext: Created broadcast 29 from broadcast
> at DAGScheduler.scala:874
> 15/08/14 09:17:29 INFO DAGScheduler: Submitting 1 missing tasks from
> ResultStage 15 (MapPartitionsRDD[61] at show at <console>:24)
> 15/08/14 09:17:29 INFO TaskSchedulerImpl: Adding task set 15.0 with 1 tasks
> 15/08/14 09:17:29 INFO TaskSetManager: Starting task 0.0 in stage 15.0
> (TID 19, localhost, PROCESS_LOCAL, 1409 bytes)
> 15/08/14 09:17:29 INFO Executor: Running task 0.0 in stage 15.0 (TID 19)
> 15/08/14 09:17:29 INFO HadoopRDD: Input split:
> file:/home/adminz/Desktop/trash.json:0+33
> 15/08/14 09:17:29 ERROR Executor: Exception in task 0.0 in stage 15.0 (TID
> 19)
> java.lang.NullPointerException
> at
> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
> at
> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at
> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to
> (TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to
> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
> at org.apache.spark.scheduler.Task.run(Task.scala:70)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
> at java.lang.Thread.run(Thread.java:662)
> 15/08/14 09:17:29 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID
> 19, localhost): java.lang.NullPointerException
> at
> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
> at
> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at
> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to
> (TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to
> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
> at org.apache.spark.scheduler.Task.run(Task.scala:70)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
> at java.lang.Thread.run(Thread.java:662)
>
> 15/08/14 09:17:29 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1
> times; aborting job
> 15/08/14 09:17:29 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose
> tasks have all completed, from pool
> 15/08/14 09:17:29 INFO TaskSchedulerImpl: Cancelling stage 15
> 15/08/14 09:17:29 INFO DAGScheduler: ResultStage 15 (show at <console>:24)
> failed in 0.069 s
> 15/08/14 09:17:29 INFO DAGScheduler: Job 15 failed: show at <console>:24,
> took 0.112457 s
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
> in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage
> 15.0 (TID 19, localhost): java.lang.NullPointerException
> at
> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
> at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
> at
> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
> at
> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at
> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at
> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to
> (TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to
> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at
> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
> at org.apache.spark.scheduler.Task.run(Task.scala:70)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
> at java.lang.Thread.run(Thread.java:662)
>
> Driver stacktrace:
> at org.apache.spark.scheduler.DAGScheduler.org
> <http://org.apache.spark.scheduler.dagscheduler.org/>
> $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
> at
> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> at
> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
> at scala.Option.foreach(Option.scala:236)
> at
> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>
>
>
>
>
>
>
Re: Another issue with using lag and lead with data frames
Posted by Salih Oztop <so...@yahoo.com.INVALID>.
Hi Jerry,This blog post is perfect for window functions in Spark.https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html
and a generic sql usage from oracle-base blog.https://oracle-base.com/articles/misc/lag-lead-analytic-functions
It seems you are not using Window part for Order By clause. Kind Regards Salih Oztop
From: Jerry <je...@gmail.com>
To: user <us...@spark.apache.org>
Sent: Friday, August 14, 2015 5:50 PM
Subject: Another issue with using lag and lead with data frames
So it seems like dataframes aren't going give me a break and just work. Now it evaluates but goes nuts if it runs into a null case OR doesn't know how to get the correct data type when I specify the default value as a string expression. Let me know if anyone has a work around to this. PLEASE HELP ME!!! THIS IS DRIVING ME NUTS! Below is what I used:
JSON:
{"A":"a"},
{"A":"c"},
{"A":"B"},
{"A":"d"},
{"A":"A"},
{"A":null}
Reading json:
df = sqlContext.jsonFile("/home/........./Desktop/trash.json")
CASE 1 (no default):
$ dfb = df.selectExpr("lag(A,1)")
$ dfb.show()
Java.lang.NullPointerException
at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
15/08/14 09:17:29 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID 19, localhost): java.lang.NullPointerException
at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
15/08/14 09:17:29 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1 times; aborting job
15/08/14 09:17:29 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose tasks have all completed, from pool
15/08/14 09:17:29 INFO TaskSchedulerImpl: Cancelling stage 15
15/08/14 09:17:29 INFO DAGScheduler: ResultStage 15 (show at <console>:24) failed in 0.069 s
15/08/14 09:17:29 INFO DAGScheduler: Job 15 failed: show at <console>:24, took 0.112457 s
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage 15.0 (TID 19, localhost): java.lang.NullPointerException
at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
CASE 2 (with default):
$ dfb = df.selectExpr("lag(A,1,'x')")
$ dfb.show()
java.lang.ClassCastException: java.lang.String cannot be cast to org.apache.hadoop.io.Text
at org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
at org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
at org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
15/08/14 09:16:09 WARN TaskSetManager: Lost task 0.0 in stage 14.0 (TID 18, localhost): java.lang.ClassCastException: java.lang.String cannot be cast to org.apache.hadoop.io.Text
at org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
at org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
at org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
15/08/14 09:16:09 ERROR TaskSetManager: Task 0 in stage 14.0 failed 1 times; aborting job
15/08/14 09:16:09 INFO TaskSchedulerImpl: Removed TaskSet 14.0, whose tasks have all completed, from pool
15/08/14 09:16:09 INFO TaskSchedulerImpl: Cancelling stage 14
15/08/14 09:16:09 INFO DAGScheduler: ResultStage 14 (show at <console>:24) failed in 0.082 s
15/08/14 09:16:09 INFO DAGScheduler: Job 14 failed: show at <console>:24, took 0.137699 s
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_25_piece0 on localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_27_piece0 on localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_26_piece0 on localhost:33504 in memory (size: 19.3 KB, free: 264.9 MB)
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 14.0 failed 1 times, most recent failure: Lost task 0.0 in stage 14.0 (TID 18, localhost): java.lang.ClassCastException: java.lang.String cannot be cast to org.apache.hadoop.io.Text
at org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
at org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
at org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
scala> dfb = df.selectExpr("lag(A,1)")
dfb: org.apache.spark.sql.DataFrame = ['lag(A,1): string]
scala> dfb.show()
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(243712) called with curMem=645979, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_28 stored as values in memory (estimated size 238.0 KB, free 264.1 MB)
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(19750) called with curMem=889691, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_28_piece0 stored as bytes in memory (estimated size 19.3 KB, free 264.1 MB)
15/08/14 09:17:29 INFO BlockManagerInfo: Added broadcast_28_piece0 in memory on localhost:33504 (size: 19.3 KB, free: 264.9 MB)
15/08/14 09:17:29 INFO SparkContext: Created broadcast 28 from show at <console>:24
15/08/14 09:17:29 INFO FileInputFormat: Total input paths to process : 1
15/08/14 09:17:29 INFO SparkContext: Starting job: show at <console>:24
15/08/14 09:17:29 INFO DAGScheduler: Got job 15 (show at <console>:24) with 1 output partitions (allowLocal=false)
15/08/14 09:17:29 INFO DAGScheduler: Final stage: ResultStage 15(show at <console>:24)
15/08/14 09:17:29 INFO DAGScheduler: Parents of final stage: List()
15/08/14 09:17:29 INFO DAGScheduler: Missing parents: List()
15/08/14 09:17:29 INFO DAGScheduler: Submitting ResultStage 15 (MapPartitionsRDD[61] at show at <console>:24), which has no missing parents
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(5936) called with curMem=909441, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29 stored as values in memory (estimated size 5.8 KB, free 264.1 MB)
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(3259) called with curMem=915377, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29_piece0 stored as bytes in memory (estimated size 3.2 KB, free 264.1 MB)
15/08/14 09:17:29 INFO BlockManagerInfo: Added broadcast_29_piece0 in memory on localhost:33504 (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:17:29 INFO SparkContext: Created broadcast 29 from broadcast at DAGScheduler.scala:874
15/08/14 09:17:29 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 15 (MapPartitionsRDD[61] at show at <console>:24)
15/08/14 09:17:29 INFO TaskSchedulerImpl: Adding task set 15.0 with 1 tasks
15/08/14 09:17:29 INFO TaskSetManager: Starting task 0.0 in stage 15.0 (TID 19, localhost, PROCESS_LOCAL, 1409 bytes)
15/08/14 09:17:29 INFO Executor: Running task 0.0 in stage 15.0 (TID 19)
15/08/14 09:17:29 INFO HadoopRDD: Input split: file:/home/adminz/Desktop/trash.json:0+33
15/08/14 09:17:29 ERROR Executor: Exception in task 0.0 in stage 15.0 (TID 19)
java.lang.NullPointerException
at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
15/08/14 09:17:29 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID 19, localhost): java.lang.NullPointerException
at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
15/08/14 09:17:29 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1 times; aborting job
15/08/14 09:17:29 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose tasks have all completed, from pool
15/08/14 09:17:29 INFO TaskSchedulerImpl: Cancelling stage 15
15/08/14 09:17:29 INFO DAGScheduler: ResultStage 15 (show at <console>:24) failed in 0.069 s
15/08/14 09:17:29 INFO DAGScheduler: Job 15 failed: show at <console>:24, took 0.112457 s
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage 15.0 (TID 19, localhost): java.lang.NullPointerException
at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:662)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)