You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Sean Owen (JIRA)" <ji...@apache.org> on 2018/01/25 17:35:00 UTC
[jira] [Commented] (SPARK-23215) Dataset Grouping: Index out of
bounds error
[ https://issues.apache.org/jira/browse/SPARK-23215?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16339533#comment-16339533 ]
Sean Owen commented on SPARK-23215:
-----------------------------------
It doesn't seem to get past count(), so this is likely a mismatch between the schema you're declaring for your input and what's actually there. I think you'd have to double check how the CSV is formatted and parsed.
> Dataset Grouping: Index out of bounds error
> -------------------------------------------
>
> Key: SPARK-23215
> URL: https://issues.apache.org/jira/browse/SPARK-23215
> Project: Spark
> Issue Type: Bug
> Components: Spark Core
> Affects Versions: 2.1.0, 2.2.0
> Reporter: Nikhil
> Priority: Major
>
> Performing a groupByKey operation followed by reduceGroups on a dataset results in java.lang.ArrayIndexOutOfBoundsException.
> *Input data (spark_issue.csv):*
> 1,nikhil
> 2,amit
> 3,rajeev
> 1,nikhil
> 2,amit2
> 3,rajeev2
> *Code:*
> {noformat}
> *no* further _formatting_ is done here
> public class SparkIndexOutOfBoundsIssue {
>
> public static final String CSV_FORMAT = "org.apache.spark.sql.execution.datasources.csv.CSVFileFormat";
>
> public static void main(String[] args) throws IOException {
> String path = "/home/gabbar/Documents/v18/src/v18-gender-age-spark/src/spark_issue.csv";
> SparkSession session = SparkSession.builder().master("local").appName("Test").getOrCreate();
> StructType schema1 = DataTypes
> .createStructType(new StructField[] {
> DataTypes.createStructField("distinct_id", DataTypes.StringType, true),
> DataTypes.createStructField("show_name", DataTypes.StringType, true)
> });
>
> StructType schema2 = DataTypes
> .createStructType(new StructField[] {
> DataTypes.createStructField("colum", DataTypes.StringType, true)
> });
> Dataset<Row> dataset = session.read().format(CSV_FORMAT).option("header", false).schema(schema1).load("/home/gabbar/Documents/v18/src/v18-gender-age-spark/src/spark_issue.csv");
> System.out.println("COUNT1: "+ dataset.count());
> dataset
> .groupByKey(
> (MapFunction<Row, Row>) row -> {
> String[] arr = new String[2];
> arr[0] = row.getAs(row.fieldIndex("distinct_id"));
> arr[1] = row.getAs(row.fieldIndex("show_name"));
> return new GenericRowWithSchema(arr, schema1);
> },RowEncoder.apply(schema1))
> .reduceGroups((a, b) -> {
> Object[] obj = new Object[1];
> obj[0] = "testdata";
> GenericRowWithSchema row = new GenericRowWithSchema(obj, schema2);
> return row ;
> })
> .collect();
> }
>
> }{noformat}
> *Error-stacktrace:*
> 2018-01-25 15:24:43,371 [Executor task launch worker-0] INFO org.apache.spark.storage.ShuffleBlockFetcherIterator - Getting 1 non-empty blocks out of 1 blocks
> 2018-01-25 15:24:43,371 [Executor task launch worker-0] INFO org.apache.spark.storage.ShuffleBlockFetcherIterator - Started 0 remote fetches in 0 ms
> 2018-01-25 15:24:43,379 [Executor task launch worker-0] ERROR org.apache.spark.executor.Executor - Exception in task 90.0 in stage 3.0 (TID 199)
> java.lang.ArrayIndexOutOfBoundsException: 1
> at org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:200)
> at org.apache.spark.sql.Row$class.isNullAt(Row.scala:185)
> at org.apache.spark.sql.catalyst.expressions.GenericRow.isNullAt(rows.scala:192)
> at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
> at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
> at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
> at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
> at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
> at org.apache.spark.scheduler.Task.run(Task.scala:85)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> 2018-01-25 15:24:43,389 [dispatcher-event-loop-3] INFO org.apache.spark.scheduler.TaskSetManager - Starting task 123.0 in stage 3.0 (TID 200, localhost, partition 123, ANY, 5275 bytes)
> 2018-01-25 15:24:43,390 [Executor task launch worker-0] INFO org.apache.spark.executor.Executor - Running task 123.0 in stage 3.0 (TID 200)
> 2018-01-25 15:24:43,391 [task-result-getter-3] WARN org.apache.spark.scheduler.TaskSetManager - Lost task 90.0 in stage 3.0 (TID 199, localhost): java.lang.ArrayIndexOutOfBoundsException: 1
> at org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:200)
> at org.apache.spark.sql.Row$class.isNullAt(Row.scala:185)
> at org.apache.spark.sql.catalyst.expressions.GenericRow.isNullAt(rows.scala:192)
> at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
> at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
> at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
> at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
> at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
> at org.apache.spark.scheduler.Task.run(Task.scala:85)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> 2018-01-25 15:24:43,391 [Executor task launch worker-0] INFO org.apache.spark.storage.ShuffleBlockFetcherIterator - Getting 1 non-empty blocks out of 1 blocks
> 2018-01-25 15:24:43,392 [Executor task launch worker-0] INFO org.apache.spark.storage.ShuffleBlockFetcherIterator - Started 0 remote fetches in 1 ms
> 2018-01-25 15:24:43,392 [task-result-getter-3] ERROR org.apache.spark.scheduler.TaskSetManager - Task 90 in stage 3.0 failed 1 times; aborting job
> 2018-01-25 15:24:43,395 [Executor task launch worker-0] INFO org.apache.spark.executor.Executor - Finished task 123.0 in stage 3.0 (TID 200). 2359 bytes result sent to driver
> 2018-01-25 15:24:43,395 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.TaskSchedulerImpl - Cancelling stage 3
> 2018-01-25 15:24:43,397 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.TaskSchedulerImpl - Stage 3 was cancelled
> 2018-01-25 15:24:43,397 [task-result-getter-0] INFO org.apache.spark.scheduler.TaskSetManager - Finished task 123.0 in stage 3.0 (TID 200) in 8 ms on localhost (197/200)
> 2018-01-25 15:24:43,397 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.DAGScheduler - ResultStage 3 (collect at SparkIndexOutOfBoundsIssue.java:49) failed in 1.257 s
> 2018-01-25 15:24:43,397 [task-result-getter-0] INFO org.apache.spark.scheduler.TaskSchedulerImpl - Removed TaskSet 3.0, whose tasks have all completed, from pool
> 2018-01-25 15:24:43,398 [main] INFO org.apache.spark.scheduler.DAGScheduler - Job 1 failed: collect at SparkIndexOutOfBoundsIssue.java:49, took 1.365428 s
> Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 90 in stage 3.0 failed 1 times, most recent failure: Lost task 90.0 in stage 3.0 (TID 199, localhost): java.lang.ArrayIndexOutOfBoundsException: 1
> at org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:200)
> at org.apache.spark.sql.Row$class.isNullAt(Row.scala:185)
> at org.apache.spark.sql.catalyst.expressions.GenericRow.isNullAt(rows.scala:192)
> at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
> at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
> at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
> at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
> at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
> at org.apache.spark.scheduler.Task.run(Task.scala:85)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Driver stacktrace:
> at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1450)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1438)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1437)
> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
> at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1437)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
> at scala.Option.foreach(Option.scala:257)
> at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1659)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1618)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1607)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1871)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1884)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1897)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1911)
> at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:893)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
> at org.apache.spark.rdd.RDD.collect(RDD.scala:892)
> at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:290)
> at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2183)
> at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
> at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2532)
> at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2182)
> at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2187)
> at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2187)
> at org.apache.spark.sql.Dataset.withCallback(Dataset.scala:2545)
> at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2187)
> at org.apache.spark.sql.Dataset.collect(Dataset.scala:2163)
> at v18.age.videowatched.SparkIndexOutOfBoundsIssue.main(SparkIndexOutOfBoundsIssue.java:49)
> Caused by: java.lang.ArrayIndexOutOfBoundsException: 1
> at org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:200)
> at org.apache.spark.sql.Row$class.isNullAt(Row.scala:185)
> at org.apache.spark.sql.catalyst.expressions.GenericRow.isNullAt(rows.scala:192)
> at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
> at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
> at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
> at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
> at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
> at org.apache.spark.scheduler.Task.run(Task.scala:85)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> 2018-01-25 15:24:43,401 [Thread-1] INFO org.apache.spark.SparkContext - Invoking stop() from shutdown hook
> 2018-01-25 15:24:43,403 [Thread-1] INFO org.spark_project.jetty.server.ServerConnector - Stopped ServerConnector@1b2c4efb\{HTTP/1.1}
> {0.0.0.0:4041}
> 2018-01-25 15:24:43,404 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@2002348\{/stages/stage/kill,null,UNAVAILABLE}
> 2018-01-25 15:24:43,404 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@3700ec9c\{/api,null,UNAVAILABLE}
> 2018-01-25 15:24:43,404 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@51972dc7\{/,null,UNAVAILABLE}
> 2018-01-25 15:24:43,404 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@294e5088\{/static,null,UNAVAILABLE}
> 2018-01-25 15:24:43,405 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@2c532cd8\{/executors/threadDump/json,null,UNAVAILABLE}
> 2018-01-25 15:24:43,405 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@5f6722d3\{/executors/threadDump,null,UNAVAILABLE}
> 2018-01-25 15:24:43,405 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@39a8312f\{/executors/json,null,UNAVAILABLE}
> 2018-01-25 15:24:43,405 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@2c5529ab\{/executors,null,UNAVAILABLE}
> 2018-01-25 15:24:43,405 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@6492fab5\{/environment/json,null,UNAVAILABLE}
> 2018-01-25 15:24:43,405 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@72c28d64\{/environment,null,UNAVAILABLE}
> 2018-01-25 15:24:43,406 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@2807bdeb\{/storage/rdd/json,null,UNAVAILABLE}
> 2018-01-25 15:24:43,406 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@18920cc\{/storage/rdd,null,UNAVAILABLE}
> 2018-01-25 15:24:43,406 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@6ebf0f36\{/storage/json,null,UNAVAILABLE}
> 2018-01-25 15:24:43,406 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@2e8e8225\{/storage,null,UNAVAILABLE}
> 2018-01-25 15:24:43,406 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@1b765a2c\{/stages/pool/json,null,UNAVAILABLE}
> 2018-01-25 15:24:43,406 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@7103cb56\{/stages/pool,null,UNAVAILABLE}
> 2018-01-25 15:24:43,407 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@178213b\{/stages/stage/json,null,UNAVAILABLE}
> 2018-01-25 15:24:43,407 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@613a8ee1\{/stages/stage,null,UNAVAILABLE}
> 2018-01-25 15:24:43,407 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@41dd05a\{/stages/json,null,UNAVAILABLE}
> 2018-01-25 15:24:43,407 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@67c2e933\{/stages,null,UNAVAILABLE}
> 2018-01-25 15:24:43,407 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@fade1fc\{/jobs/job/json,null,UNAVAILABLE}
> 2018-01-25 15:24:43,408 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@7ca33c24\{/jobs/job,null,UNAVAILABLE}
> 2018-01-25 15:24:43,408 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@61f05988\{/jobs/json,null,UNAVAILABLE}
> 2018-01-25 15:24:43,408 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@59f63e24\{/jobs,null,UNAVAILABLE}
> 2018-01-25 15:24:43,409 [Thread-1] INFO org.apache.spark.ui.SparkUI - Stopped Spark web UI at [http://192.168.1.8:4041|http://192.168.1.8:4041/]
> 2018-01-25 15:24:43,416 [dispatcher-event-loop-9] INFO org.apache.spark.MapOutputTrackerMasterEndpoint - MapOutputTrackerMasterEndpoint stopped!
> 2018-01-25 15:24:43,424 [Thread-1] INFO org.apache.spark.storage.memory.MemoryStore - MemoryStore cleared
> 2018-01-25 15:24:43,424 [Thread-1] INFO org.apache.spark.storage.BlockManager - BlockManager stopped
> 2018-01-25 15:24:43,425 [Thread-1] INFO org.apache.spark.storage.BlockManagerMaster - BlockManagerMaster stopped
> 2018-01-25 15:24:43,427 [dispatcher-event-loop-1] INFO org.apache.spark.scheduler.OutputCommitCoordinator$OutputCommitCoordinatorEndpoint - OutputCommitCoordinator stopped!
> 2018-01-25 15:24:43,428 [Thread-1] INFO org.apache.spark.SparkContext - Successfully stopped SparkContext
> 2018-01-25 15:24:43,428 [Thread-1] INFO org.apache.spark.util.ShutdownHookManager - Shutdown hook called
> 2018-01-25 15:24:43,429 [Thread-1] INFO org.apache.spark.util.ShutdownHookManager - Deleting directory /tmp/spark-978612f4-1e96-4366-9d14-ebe54627e6d4
> *Note:* If we change the data so that all rows are distinct over the two columns of the input CSV, the above code does not throw an error.
>
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org