Posted to issues@spark.apache.org by "Bryan Jeffrey (JIRA)" <ji...@apache.org> on 2016/07/30 01:37:20 UTC

[jira] [Comment Edited] (SPARK-16797) Repartition call w/ 0 partitions drops data

    [ https://issues.apache.org/jira/browse/SPARK-16797?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15400356#comment-15400356 ] 

Bryan Jeffrey edited comment on SPARK-16797 at 7/30/16 1:36 AM:
----------------------------------------------------------------

Hello. Here is a simple example using Spark 1.6.1:

{code}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SimpleExample {
  def main(args: Array[String]): Unit = {
    val appName = "Simple Example"
    val sparkConf = new SparkConf().setAppName(appName)
    val sc = new SparkContext(sparkConf)
    val ssc = new StreamingContext(sc, Seconds(5))

    // Feed a single RDD of eight elements into the stream via a queue.
    val input = Array(1, 2, 3, 4, 5, 6, 7, 8)
    val queue = scala.collection.mutable.Queue(ssc.sparkContext.parallelize(input))
    val data: InputDStream[Int] = ssc.queueStream(queue)

    // Baseline: count the elements before any repartitioning.
    data.foreachRDD(x => println("Initial Count: " + x.count()))

    // Repartitioning the stream to 0 partitions silently drops all data.
    val streamPartition = data.repartition(0)
    streamPartition.foreachRDD(x => println("Stream Repartition: " + x.count()))

    // Same result when repartitioning the underlying RDDs to 0 partitions.
    val rddPartition = data.transform(x => x.repartition(0))
    rddPartition.foreachRDD(x => println("Rdd Repartition: " + x.count()))

    // Control: repartitioning to 1 partition preserves all eight elements.
    val singlePartitionStreamPartition = data.repartition(1)
    singlePartitionStreamPartition.foreachRDD(x => println("Stream w/ Single Partition: " + x.count()))

    ssc.start()
    ssc.awaitTermination()
  }
}
{code}

Output:
{code}
Initial Count: 8
Stream Repartition: 0
Rdd Repartition: 0
Stream w/ Single Partition: 8
Initial Count: 0
Stream Repartition: 0
Rdd Repartition: 0
Stream w/ Single Partition: 0
{code}
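
(In the output above, the second batch prints zeros only because the queue supplied a single RDD; the bug is the first batch, where repartition(0) turns eight elements into zero.)

A quick way to see why the counts drop: repartition(0) yields an RDD whose partitions array is empty, so any action over it runs no tasks. A minimal sketch, assuming Spark 1.6.x behavior (before any validation of the partition count):

{code}
// Minimal sketch (assumed 1.6.x behavior): an RDD repartitioned to 0
// partitions reports no partitions at all, so count() is trivially 0.
val rdd = sc.parallelize(1 to 8)
val empty = rdd.repartition(0)
println(empty.partitions.length) // 0
println(empty.count())           // 0 -- the eight input elements are gone
{code}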


> Repartition call w/ 0 partitions drops data
> -------------------------------------------
>
>                 Key: SPARK-16797
>                 URL: https://issues.apache.org/jira/browse/SPARK-16797
>             Project: Spark
>          Issue Type: Bug
>          Components: Spark Core
>    Affects Versions: 1.6.2, 2.0.0
>            Reporter: Bryan Jeffrey
>            Priority: Minor
>              Labels: easyfix
>
> When you call RDD.repartition(0) or DStream.repartition(0), the input data is silently dropped. This should not fail silently; instead, an exception should be thrown to alert the user to the issue.
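
Until such a check exists in Spark itself, callers can fail fast on their side. A minimal, purely illustrative sketch of the proposed validation (safeRepartition is a hypothetical helper, not a Spark API):

{code}
import org.apache.spark.rdd.RDD

// Hypothetical helper sketching the proposed behavior: throw up front
// instead of silently dropping data when numPartitions is not positive.
def safeRepartition[T](rdd: RDD[T], numPartitions: Int): RDD[T] = {
  require(numPartitions > 0,
    s"Number of partitions ($numPartitions) must be positive.")
  rdd.repartition(numPartitions)
}
{code}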


