You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by witgo <gi...@git.apache.org> on 2015/02/26 10:03:56 UTC

[GitHub] spark pull request: [SPARK-4902][CORE] gap-sampling performance op...

Github user witgo commented on the pull request:

    https://github.com/apache/spark/pull/3744#issuecomment-76143785
  
    ```scala
    test("bernoulli sampling benchmark") {
      /**
       * Benchmark harness around `BernoulliSampler`.
       *
       * Every timed run builds a fresh sampler for `fraction` and samples a fresh
       * iterator obtained from `items`.
       *
       * @param fraction sampling fraction handed to `BernoulliSampler`
       * @param items    factory producing a new input iterator for each run
       */
      class BernoulliSamplerBenchmark(val fraction: Double, items: () => Iterator[Int])
        extends scala.testing.Benchmark {
        override def run(): Unit = {
          val sampler = new BernoulliSampler[Int](fraction)
          // `size` traverses whatever `sample` returns, so the sampling work is really
          // performed inside the timed run; the count itself is not needed.
          sampler.sample(items()).size
        }
      }

      val context = new org.apache.spark.TaskContextImpl(0, 0, 0, 0)
      val len = 1e6.toInt
      val noTimes = 1000
      // Built once up front: its contents never change between measurements.
      val array = (1 to len).toArray

      // Input for the "general" measurements: a hand-rolled iterator yielding 1..len.
      val countingItems: () => Iterator[Int] = () => new Iterator[Int] {
        var i = 0

        override def hasNext: Boolean = i < len

        override def next(): Int = {
          i += 1
          i
        }
      }

      // Input for the "gap" measurements: the pre-built array wrapped in an
      // InterruptibleIterator bound to the dummy task context above.
      val arrayItems: () => Iterator[Int] = () =>
        new org.apache.spark.InterruptibleIterator(context, array.iterator)

      // Runs one benchmark `noTimes` times and prints the mean of the per-run timings.
      // NOTE(review): both labels exercise the same BernoulliSampler class; the two
      // cases differ only in the iterator they are fed.
      def report(label: String, fraction: Double, items: () => Iterator[Int]): Unit = {
        val benchmark = new BernoulliSamplerBenchmark(fraction, items)
        val time = benchmark.runBenchmark(noTimes).sum.toDouble / noTimes
        println(s"$label sampling fraction=$fraction len=$len use time: $time Ms")
      }

      // Same fractions and same output order as the original hand-unrolled version.
      for (fraction <- Seq(0.2, 0.05, 0.01)) {
        report("general", fraction, countingItems)
        report("gap", fraction, arrayItems)
      }
    }
    ```
    
    =>
    
    ```
    general sampling fraction=0.2 len=1000000 use time: 14.562 Ms
    gap sampling fraction=0.2 len=1000000 use time: 16.352 Ms
    general sampling fraction=0.05 len=1000000 use time: 5.408 Ms
    gap sampling fraction=0.05 len=1000000 use time: 4.251 Ms
    general sampling fraction=0.01 len=1000000 use time: 7.528 Ms
    gap sampling fraction=0.01 len=1000000 use time: 1.009 Ms
    ```


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org