Posted to user@spark.apache.org by Jerry <je...@gmail.com> on 2015/08/14 18:50:59 UTC

Another issue with using lag and lead with data frames

So it seems like DataFrames aren't going to give me a break and just work. Now
it evaluates, but it goes nuts if it runs into a null case OR doesn't know how
to get the correct data type when I specify the default value as a string
expression. Let me know if anyone has a workaround for this. PLEASE HELP
ME!!!  THIS IS DRIVING ME NUTS! Below is what I used:

JSON (one object per line, no trailing commas, as jsonFile expects):
{"A":"a"}
{"A":"c"}
{"A":"B"}
{"A":"d"}
{"A":"A"}
{"A":null}
Reading json:
df = sqlContext.jsonFile("/home/........./Desktop/trash.json")


CASE 1 (no default):

$ dfb = df.selectExpr("lag(A,1)")
$ dfb.show()
java.lang.NullPointerException
    at
org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
    at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
    at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
    at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
    at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
    at scala.collection.Iterator$class.foreach(Iterator.scala:727)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
    at
scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
    at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
    at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
    at scala.collection.AbstractIterator.to(Iterator.scala:1157)
    at
scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
    at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
    at
scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
    at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
    at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
    at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
    at java.lang.Thread.run(Thread.java:662)
15/08/14 09:17:29 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID 19,
localhost): java.lang.NullPointerException
    [... same NullPointerException stack trace as above ...]

15/08/14 09:17:29 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1
times; aborting job
15/08/14 09:17:29 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose tasks
have all completed, from pool
15/08/14 09:17:29 INFO TaskSchedulerImpl: Cancelling stage 15
15/08/14 09:17:29 INFO DAGScheduler: ResultStage 15 (show at <console>:24)
failed in 0.069 s
15/08/14 09:17:29 INFO DAGScheduler: Job 15 failed: show at <console>:24,
took 0.112457 s
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage
15.0 (TID 19, localhost): java.lang.NullPointerException
    [... same NullPointerException stack trace as above ...]

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org
$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
    at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
    at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
    at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
    at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at scala.Option.foreach(Option.scala:236)
    at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
    at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
    at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
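
If the null row really is what trips GenericUDFLeadLag, a quick way to
confirm is to drop it before applying lag. A minimal sketch (Scala; assumes
Spark 1.4's DataFrameNaFunctions, with df being the DataFrame read above):

    // Drop rows where A is null, then retry the same expression.
    val dfNoNulls = df.na.drop(Seq("A"))
    dfNoNulls.selectExpr("lag(A,1)").show()

If that runs cleanly, the NPE is specifically the null input reaching the
Hive lead/lag UDF.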


CASE 2 (with default):

$ dfb = df.selectExpr("lag(A,1,'x')")
$ dfb.show()

java.lang.ClassCastException: java.lang.String cannot be cast to
org.apache.hadoop.io.Text
    at
org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
    at
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
    at
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
    at
org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
    at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
    at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
    at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
    at
org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
    at scala.collection.Iterator$class.foreach(Iterator.scala:727)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
    at
scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
    at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
    at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
    at scala.collection.AbstractIterator.to(Iterator.scala:1157)
    at
scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
    at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
    at
scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
    at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
    at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at
org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
    at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
    at java.lang.Thread.run(Thread.java:662)
15/08/14 09:16:09 WARN TaskSetManager: Lost task 0.0 in stage 14.0 (TID 18,
localhost): java.lang.ClassCastException: java.lang.String cannot be cast
to org.apache.hadoop.io.Text
    [... same ClassCastException stack trace as above ...]

15/08/14 09:16:09 ERROR TaskSetManager: Task 0 in stage 14.0 failed 1
times; aborting job
15/08/14 09:16:09 INFO TaskSchedulerImpl: Removed TaskSet 14.0, whose tasks
have all completed, from pool
15/08/14 09:16:09 INFO TaskSchedulerImpl: Cancelling stage 14
15/08/14 09:16:09 INFO DAGScheduler: ResultStage 14 (show at <console>:24)
failed in 0.082 s
15/08/14 09:16:09 INFO DAGScheduler: Job 14 failed: show at <console>:24,
took 0.137699 s
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_25_piece0 on
localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_27_piece0 on
localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_26_piece0 on
localhost:33504 in memory (size: 19.3 KB, free: 264.9 MB)
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
in stage 14.0 failed 1 times, most recent failure: Lost task 0.0 in stage
14.0 (TID 18, localhost): java.lang.ClassCastException: java.lang.String
cannot be cast to org.apache.hadoop.io.Text
    [... same ClassCastException stack trace as above ...]

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org
$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
    at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
    at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
    at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
    at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at scala.Option.foreach(Option.scala:236)
    at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
    at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
    at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
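
The ClassCastException suggests the string default 'x' reaches the Hive UDF
as a plain java.lang.String where org.apache.hadoop.io.Text is expected. One
way to sidestep the default argument entirely is to compute lag without it
and fill in the resulting nulls afterwards. A sketch (Scala; coalesce and
lit are in org.apache.spark.sql.functions, and this assumes the null-input
NPE from CASE 1 is avoided first, e.g. with na.drop as above):

    import org.apache.spark.sql.functions.{coalesce, lit}

    // lag without a default, then substitute 'x' for the nulls it produces.
    val lagged = df.na.drop(Seq("A")).selectExpr("lag(A,1) as LA")
    lagged.select(coalesce(lagged("LA"), lit("x")).as("LA")).show()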


scala> dfb = df.selectExpr("lag(A,1)")
dfb: org.apache.spark.sql.DataFrame = ['lag(A,1): string]

scala> dfb.show()
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(243712) called with
curMem=645979, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_28 stored as values in
memory (estimated size 238.0 KB, free 264.1 MB)
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(19750) called with
curMem=889691, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_28_piece0 stored as
bytes in memory (estimated size 19.3 KB, free 264.1 MB)
15/08/14 09:17:29 INFO BlockManagerInfo: Added broadcast_28_piece0 in
memory on localhost:33504 (size: 19.3 KB, free: 264.9 MB)
15/08/14 09:17:29 INFO SparkContext: Created broadcast 28 from show at
<console>:24
15/08/14 09:17:29 INFO FileInputFormat: Total input paths to process : 1
15/08/14 09:17:29 INFO SparkContext: Starting job: show at <console>:24
15/08/14 09:17:29 INFO DAGScheduler: Got job 15 (show at <console>:24) with
1 output partitions (allowLocal=false)
15/08/14 09:17:29 INFO DAGScheduler: Final stage: ResultStage 15(show at
<console>:24)
15/08/14 09:17:29 INFO DAGScheduler: Parents of final stage: List()
15/08/14 09:17:29 INFO DAGScheduler: Missing parents: List()
15/08/14 09:17:29 INFO DAGScheduler: Submitting ResultStage 15
(MapPartitionsRDD[61] at show at <console>:24), which has no missing parents
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(5936) called with
curMem=909441, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29 stored as values in
memory (estimated size 5.8 KB, free 264.1 MB)
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(3259) called with
curMem=915377, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29_piece0 stored as
bytes in memory (estimated size 3.2 KB, free 264.1 MB)
15/08/14 09:17:29 INFO BlockManagerInfo: Added broadcast_29_piece0 in
memory on localhost:33504 (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:17:29 INFO SparkContext: Created broadcast 29 from broadcast at
DAGScheduler.scala:874
15/08/14 09:17:29 INFO DAGScheduler: Submitting 1 missing tasks from
ResultStage 15 (MapPartitionsRDD[61] at show at <console>:24)
15/08/14 09:17:29 INFO TaskSchedulerImpl: Adding task set 15.0 with 1 tasks
15/08/14 09:17:29 INFO TaskSetManager: Starting task 0.0 in stage 15.0 (TID
19, localhost, PROCESS_LOCAL, 1409 bytes)
15/08/14 09:17:29 INFO Executor: Running task 0.0 in stage 15.0 (TID 19)
15/08/14 09:17:29 INFO HadoopRDD: Input split:
file:/home/adminz/Desktop/trash.json:0+33
15/08/14 09:17:29 ERROR Executor: Exception in task 0.0 in stage 15.0 (TID
19)
java.lang.NullPointerException
    [... same NullPointerException stack trace, task-failure warnings,
    job-abort messages, and driver stack trace as in CASE 1 above ...]

Re: Another issue with using lag and lead with data frames

Posted by Jerry <je...@gmail.com>.
Still not cooperating...

lag(A,1,'X') OVER (ORDER BY A) as LA
                  ^
    at scala.sys.package$.error(package.scala:27)
    at
org.apache.spark.sql.catalyst.SqlParser.parseExpression(SqlParser.scala:45)
    at
org.apache.spark.sql.DataFrame$$anonfun$selectExpr$1.apply(DataFrame.scala:626)
    at
org.apache.spark.sql.DataFrame$$anonfun$selectExpr$1.apply(DataFrame.scala:625)
    at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at
scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
    at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:34)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
    at scala.collection.AbstractTraversable.map(Traversable.scala:105)
    at org.apache.spark.sql.DataFrame.selectExpr(DataFrame.scala:625)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:21)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:26)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:28)
    at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:30)
    at $iwC$$iwC$$iwC$$iwC.<init>(<console>:32)
    at $iwC$$iwC$$iwC.<init>(<console>:34)
    at $iwC$$iwC.<init>(<console>:36)
    at $iwC.<init>(<console>:38)
    at <init>(<console>:40)
    at .<init>(<console>:44)
    at .<clinit>(<console>)
    at .<init>(<console>:7)
    at .<clinit>(<console>)
    at $print(<console>)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
    at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
    at java.lang.reflect.Method.invoke(Method.java:597)
    at
org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
    at
org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1338)
    at
org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
    at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
    at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
    at
org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
    at
org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
    at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
    at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
    at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
    at org.apache.spark.repl.SparkILoop.org
$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
    at
org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
    at
org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
    at
org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
    at
scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
    at org.apache.spark.repl.SparkILoop.org
$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
    at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059)
    at org.apache.spark.repl.Main$.main(Main.scala:31)
    at org.apache.spark.repl.Main.main(Main.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
    at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
    at java.lang.reflect.Method.invoke(Method.java:597)
    at
org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:665)
    at
org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:170)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:193)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:112)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
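
That parse failure comes from DataFrame.selectExpr, which goes through
Spark's basic SqlParser; in 1.4 that parser does not understand the OVER
clause, so the window syntax never reaches Hive at all. Two routes the 1.4
docs describe for window functions, both needing a HiveContext (they are
Hive-backed at this point), sketched here:

    import org.apache.spark.sql.expressions.Window
    import org.apache.spark.sql.functions.lag

    // Route 1: the DataFrame window API; no SQL-string parsing involved.
    val w = Window.orderBy("A")
    df.select(lag("A", 1).over(w).as("LA")).show()

    // Route 2: full SQL through the Hive parser, which does accept OVER.
    df.registerTempTable("t")
    sqlContext.sql("SELECT lag(A, 1) OVER (ORDER BY A) AS LA FROM t").show()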


On Fri, Aug 14, 2015 at 1:39 PM, Jerry <je...@gmail.com> wrote:

> Hi Salih,
> Normally I do sort before performing that operation, but since I've been
> trying to get this working for a week, I'm just loading something simple to
> test if lag works. Earlier I was having DB issues... so it's been a long
> run of solving one runtime exception after another. Hopefully those links
> point me to something useful. Let me know if you can run the above code /
> what you did differently to get that code to run.
>
> Thanks,
>       Jerry
>
> On Fri, Aug 14, 2015 at 1:23 PM, Salih Oztop <so...@yahoo.com> wrote:
>
>> Hi Jerry,
>> This blog post is perfect for window functions in Spark.
>>
>> https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html
>> and generic SQL usage from the oracle-base blog:
>> https://oracle-base.com/articles/misc/lag-lead-analytic-functions
>>
>> It seems you are not using the window (OVER ... ORDER BY) clause.
>>
>> Kind Regards
>> Salih Oztop
>> curMem=909441, maxMem=277842493
>> 15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29 stored as values
>> in memory (estimated size 5.8 KB, free 264.1 MB)
>> 15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(3259) called with
>> curMem=915377, maxMem=277842493
>> 15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29_piece0 stored as
>> bytes in memory (estimated size 3.2 KB, free 264.1 MB)
>> 15/08/14 09:17:29 INFO BlockManagerInfo: Added broadcast_29_piece0 in
>> memory on localhost:33504 (size: 3.2 KB, free: 264.9 MB)
>> 15/08/14 09:17:29 INFO SparkContext: Created broadcast 29 from broadcast
>> at DAGScheduler.scala:874
>> 15/08/14 09:17:29 INFO DAGScheduler: Submitting 1 missing tasks from
>> ResultStage 15 (MapPartitionsRDD[61] at show at <console>:24)
>> 15/08/14 09:17:29 INFO TaskSchedulerImpl: Adding task set 15.0 with 1
>> tasks
>> 15/08/14 09:17:29 INFO TaskSetManager: Starting task 0.0 in stage 15.0
>> (TID 19, localhost, PROCESS_LOCAL, 1409 bytes)
>> 15/08/14 09:17:29 INFO Executor: Running task 0.0 in stage 15.0 (TID 19)
>> 15/08/14 09:17:29 INFO HadoopRDD: Input split:
>> file:/home/adminz/Desktop/trash.json:0+33
>> 15/08/14 09:17:29 ERROR Executor: Exception in task 0.0 in stage 15.0
>> (TID 19)
>> java.lang.NullPointerException
>>     at
>> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
>>     at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
>>     at
>> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
>>     at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
>>     at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
>>     at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>>     at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>>     at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
>>     at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>>     at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>>     at
>> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
>>     at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
>>     at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
>>     at scala.collection.TraversableOnce$class.to
>> (TraversableOnce.scala:273)
>>     at scala.collection.AbstractIterator.to
>> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
>>     at
>> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
>>     at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
>>     at
>> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
>>     at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
>>     at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>>     at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>>     at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>>     at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
>>     at org.apache.spark.scheduler.Task.run(Task.scala:70)
>>     at
>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
>>     at
>> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
>>     at
>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
>>     at java.lang.Thread.run(Thread.java:662)
>> 15/08/14 09:17:29 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID
>> 19, localhost): java.lang.NullPointerException
>>     at
>> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
>>     at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
>>     at
>> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
>>     at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
>>     at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
>>     at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>>     at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>>     at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
>>     at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>>     at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>>     at
>> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
>>     at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
>>     at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
>>     at scala.collection.TraversableOnce$class.to
>> (TraversableOnce.scala:273)
>>     at scala.collection.AbstractIterator.to
>> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
>>     at
>> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
>>     at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
>>     at
>> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
>>     at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
>>     at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>>     at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>>     at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>>     at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
>>     at org.apache.spark.scheduler.Task.run(Task.scala:70)
>>     at
>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
>>     at
>> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
>>     at
>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
>>     at java.lang.Thread.run(Thread.java:662)
>>
>> 15/08/14 09:17:29 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1
>> times; aborting job
>> 15/08/14 09:17:29 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose
>> tasks have all completed, from pool
>> 15/08/14 09:17:29 INFO TaskSchedulerImpl: Cancelling stage 15
>> 15/08/14 09:17:29 INFO DAGScheduler: ResultStage 15 (show at
>> <console>:24) failed in 0.069 s
>> 15/08/14 09:17:29 INFO DAGScheduler: Job 15 failed: show at <console>:24,
>> took 0.112457 s
>> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
>> in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage
>> 15.0 (TID 19, localhost): java.lang.NullPointerException
>>     at
>> org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
>>     at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
>>     at
>> org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
>>     at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
>>     at
>> org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
>>     at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>>     at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>>     at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
>>     at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>>     at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>>     at
>> scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
>>     at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
>>     at
>> scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
>>     at scala.collection.TraversableOnce$class.to
>> (TraversableOnce.scala:273)
>>     at scala.collection.AbstractIterator.to
>> <http://scala.collection.abstractiterator.to/>(Iterator.scala:1157)
>>     at
>> scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
>>     at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
>>     at
>> scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
>>     at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
>>     at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>>     at
>> org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
>>     at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>>     at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
>>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
>>     at org.apache.spark.scheduler.Task.run(Task.scala:70)
>>     at
>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
>>     at
>> java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
>>     at
>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
>>     at java.lang.Thread.run(Thread.java:662)
>>
>> Driver stacktrace:
>>     at org.apache.spark.scheduler.DAGScheduler.org
>> <http://org.apache.spark.scheduler.dagscheduler.org/>
>> $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
>>     at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
>>     at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
>>     at
>> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>>     at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>>     at
>> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
>>     at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
>>     at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
>>     at scala.Option.foreach(Option.scala:236)
>>     at
>> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
>>     at
>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
>>     at
>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
>>     at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>>
>>
>>
>>
>>
>>
>>
>

Re: Another issue with using lag and lead with data frames

Posted by Jerry <je...@gmail.com>.
Hi Salih,
Normally I do sort before performing that operation, but since I've been
trying to get this working for a week, I'm just loading something simple to
test whether lag works at all. Earlier I was having DB issues, so it's been a
long run of solving one runtime exception after another. Hopefully those
links point me to something useful. Let me know if you can run the above
code, or what you did differently to get it to run.

Thanks,
      Jerry

On Fri, Aug 14, 2015 at 1:23 PM, Salih Oztop <so...@yahoo.com> wrote:

> Hi Jerry,
> This blog post is perfect for window functions in Spark.
>
> https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html
> and a generic SQL usage example from the oracle-base blog.
> https://oracle-base.com/articles/misc/lag-lead-analytic-functions
>
> It seems you are not using the Window part with an ORDER BY clause.
>
> Kind Regards
> Salih Oztop
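
A minimal sketch of the explicit-window form being suggested, assuming Spark
1.4+ where sqlContext is a HiveContext (a plain SQLContext cannot resolve
window functions at this version). The temp table name "t" and ordering by A
are illustrative choices, not from the original post:

df.registerTempTable("t")
// With an OVER clause, lag is planned as a window function; without one it
// falls through to the bare Hive UDF (GenericUDFLeadLag), which matches the
// frames in the traces above.
val dfb = sqlContext.sql(
  "SELECT A, lag(A, 1) OVER (ORDER BY A) AS prev_A FROM t")
dfb.show()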

Re: Another issue with using lag and lead with data frames

Posted by Salih Oztop <so...@yahoo.com.INVALID>.
Hi Jerry,
This blog post is perfect for window functions in Spark:
https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html
and a generic SQL usage example from the oracle-base blog:
https://oracle-base.com/articles/misc/lag-lead-analytic-functions

It seems you are not using the Window part with an ORDER BY clause.

Kind Regards
Salih Oztop
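
The same idea with the DataFrame API, as a minimal sketch assuming Spark 1.4+
and a HiveContext; ordering by A and the default value 'x' mirror the cases
in the original post:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.lag

// The window spec supplies the ordering that lag/lead require.
val w = Window.orderBy("A")
// The third argument is the default returned for the first row instead of null.
val dfb = df.select(lag(df("A"), 1, "x").over(w).alias("prev_A"))
dfb.show()

Without a partitionBy this funnels every row through a single partition,
which is fine for a six-row test file but worth noting for real data.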
      From: Jerry <je...@gmail.com>
 To: user <us...@spark.apache.org> 
 Sent: Friday, August 14, 2015 5:50 PM
 Subject: Another issue with using lag and lead with data frames
   
So it seems like dataframes aren't going give me a break and just work. Now it evaluates but goes nuts if it runs into a null case OR doesn't know how to get the correct data type when I specify the default value as a string expression. Let me know if anyone has a work around to this. PLEASE HELP ME!!!  THIS IS DRIVING ME NUTS! Below is what I used:

JSON:
{"A":"a"},
{"A":"c"},
{"A":"B"},
{"A":"d"},
{"A":"A"},
{"A":null}
Reading json:
df = sqlContext.jsonFile("/home/........./Desktop/trash.json")


CASE 1 (no default):
$ dfb = df.selectExpr("lag(A,1)")
$ dfb.show()
Java.lang.NullPointerException
    at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
    at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
    at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
    at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
    at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
    at scala.collection.Iterator$class.foreach(Iterator.scala:727)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
    at scala.collection.AbstractIterator.to(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
    at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
    at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
    at java.lang.Thread.run(Thread.java:662)
15/08/14 09:17:29 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID 19, localhost): java.lang.NullPointerException
    at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
    at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
    at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
    at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
    at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
    at scala.collection.Iterator$class.foreach(Iterator.scala:727)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
    at scala.collection.AbstractIterator.to(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
    at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
    at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
    at java.lang.Thread.run(Thread.java:662)

15/08/14 09:17:29 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1 times; aborting job
15/08/14 09:17:29 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose tasks have all completed, from pool 
15/08/14 09:17:29 INFO TaskSchedulerImpl: Cancelling stage 15
15/08/14 09:17:29 INFO DAGScheduler: ResultStage 15 (show at <console>:24) failed in 0.069 s
15/08/14 09:17:29 INFO DAGScheduler: Job 15 failed: show at <console>:24, took 0.112457 s
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage 15.0 (TID 19, localhost): java.lang.NullPointerException
    ... (same NullPointerException stack trace as above)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
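
A possible workaround for CASE 1: the NPE is thrown inside Hive's GenericUDFLeadLag, which looks like it trips over the missing default value. One way to avoid the Hive UDF path entirely is the DataFrame window-function API added in 1.4 (window functions still need a HiveContext). A minimal sketch, untested against this data; the ordering column and the lag_A alias are my own choices, since lag() needs some window ordering:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.lag

// Assumption: order the window by A itself, purely to make it well-defined;
// a real sequence or timestamp column would be better if one exists.
val w = Window.orderBy("A")

val dfb = df.select(lag(df("A"), 1).over(w).alias("lag_A"))
dfb.show()  // the first row should come back null instead of throwing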


CASE 2 (with default):
$ dfb = df.selectExpr("lag(A,1,'x')")
$ dfb.show()

java.lang.ClassCastException: java.lang.String cannot be cast to org.apache.hadoop.io.Text
    at org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
    at org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
    at org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
    at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
    at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
    at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
    at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
    at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
    at scala.collection.Iterator$class.foreach(Iterator.scala:727)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
    at scala.collection.AbstractIterator.to(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
    at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
    at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
    at java.lang.Thread.run(Thread.java:662)
15/08/14 09:16:09 WARN TaskSetManager: Lost task 0.0 in stage 14.0 (TID 18, localhost): java.lang.ClassCastException: java.lang.String cannot be cast to org.apache.hadoop.io.Text
    ... (same ClassCastException stack trace as above)

15/08/14 09:16:09 ERROR TaskSetManager: Task 0 in stage 14.0 failed 1 times; aborting job
15/08/14 09:16:09 INFO TaskSchedulerImpl: Removed TaskSet 14.0, whose tasks have all completed, from pool 
15/08/14 09:16:09 INFO TaskSchedulerImpl: Cancelling stage 14
15/08/14 09:16:09 INFO DAGScheduler: ResultStage 14 (show at <console>:24) failed in 0.082 s
15/08/14 09:16:09 INFO DAGScheduler: Job 14 failed: show at <console>:24, took 0.137699 s
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_25_piece0 on localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_27_piece0 on localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_26_piece0 on localhost:33504 in memory (size: 19.3 KB, free: 264.9 MB)
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 14.0 failed 1 times, most recent failure: Lost task 0.0 in stage 14.0 (TID 18, localhost): java.lang.ClassCastException: java.lang.String cannot be cast to org.apache.hadoop.io.Text
    ... (same ClassCastException stack trace as above)

Driver stacktrace:
    ... (same driver stack trace as in CASE 1)
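
And a possible workaround for CASE 2: the ClassCastException happens because Hive's WritableStringObjectInspector receives the default 'x' as a plain java.lang.String instead of a hadoop Text, so the cast fails inside the UDF no matter how I write the expression. A sketch that never hands the default to Hive at all, filling it in afterwards with coalesce (same window-ordering assumption as the CASE 1 sketch):

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{coalesce, lag, lit}

val w = Window.orderBy("A")  // assumption: same ordering caveat as above

// Apply the default after the window function, so no literal reaches the Hive UDF.
val dfb = df.select(coalesce(lag(df("A"), 1).over(w), lit("x")).alias("lag_A"))
dfb.show()

For reference, here is the full console transcript for CASE 1: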


scala> dfb = df.selectExpr("lag(A,1)")
dfb: org.apache.spark.sql.DataFrame = ['lag(A,1): string]

scala> dfb.show()
15/08/14 09:17:29 INFO HadoopRDD: Input split: file:/home/adminz/Desktop/trash.json:0+33
15/08/14 09:17:29 ERROR Executor: Exception in task 0.0 in stage 15.0 (TID 19)
java.lang.NullPointerException
    at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
    at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
    ... (the rest of the transcript is the same NullPointerException stack trace, TaskSetManager warning, SparkException, and driver stack trace already shown in CASE 1)
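
One more route I may try, in case the real problem is simply that lag() is a window function being called without a window: spelling out the OVER clause through SQL. A hedged sketch, assuming sqlContext is a HiveContext (window functions require it in 1.4/1.5) and using a temp-table name of my own:

df.registerTempTable("trash")

// lag with an explicit window and default, evaluated through the SQL path
val dfb2 = sqlContext.sql(
  "SELECT A, lag(A, 1, 'x') OVER (ORDER BY A) AS lag_A FROM trash")
dfb2.show()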