Posted to commits@spark.apache.org by sr...@apache.org on 2015/05/09 11:03:26 UTC

spark git commit: [SPARK-7438] [SPARK CORE] Fixed validation of relativeSD in countApproxDistinct

Repository: spark
Updated Branches:
  refs/heads/master 299262384 -> dda6d9f40


[SPARK-7438] [SPARK CORE] Fixed validation of relativeSD in countApproxDistinct

Author: Vinod K C <vi...@huawei.com>

Closes #5974 from vinodkc/fix_countApproxDistinct_Validation and squashes the following commits:

3a3d59c [Vinod K C] Reverted removal of validation relativeSD<0.000017
799976e [Vinod K C] Removed testcase to assert IAE when relativeSD>3.7
8ddbfae [Vinod K C] Remove blank line
b1b00a3 [Vinod K C] Removed relativeSD validation from python API, RDD.scala will do validation
122d378 [Vinod K C] Fixed validation of relativeSD in countApproxDistinct


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dda6d9f4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dda6d9f4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dda6d9f4

Branch: refs/heads/master
Commit: dda6d9f4045fa2d1265abffa9d7dbdc967448417
Parents: 2992623
Author: Vinod K C <vi...@huawei.com>
Authored: Sat May 9 10:03:15 2015 +0100
Committer: Sean Owen <so...@cloudera.com>
Committed: Sat May 9 10:03:15 2015 +0100

----------------------------------------------------------------------
 core/src/main/scala/org/apache/spark/rdd/RDD.scala      | 7 ++++---
 core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala | 2 ++
 python/pyspark/rdd.py                                   | 2 --
 python/pyspark/tests.py                                 | 1 -
 4 files changed, 6 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
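
In practical terms, the patch centralizes the relativeSD check in RDD.scala and stops rejecting coarse accuracy values such as 0.5. The short Scala sketch below illustrates the resulting behavior; it assumes an existing SparkContext named `sc` and is not part of this commit:

  // Illustration only; assumes a live SparkContext `sc`.
  val rdd = sc.parallelize(1 to 100000)

  // relativeSD = 0.5 maps to p = 3, which previously failed require(p >= 4);
  // with the new clamp to p = 4 this call now succeeds.
  val roughCount = rdd.countApproxDistinct(0.5)

  // Values at or below 0.000017 are still rejected, now on the Scala side too:
  // rdd.countApproxDistinct(0.00000001)  // throws IllegalArgumentException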


http://git-wip-us.apache.org/repos/asf/spark/blob/dda6d9f4/core/src/main/scala/org/apache/spark/rdd/RDD.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index 8baf199..7dad30e 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1161,8 +1161,8 @@ abstract class RDD[T: ClassTag](
    */
   @Experimental
   def countApproxDistinct(p: Int, sp: Int): Long = withScope {
-    require(p >= 4, s"p ($p) must be at least 4")
-    require(sp <= 32, s"sp ($sp) cannot be greater than 32")
+    require(p >= 4, s"p ($p) must be >= 4")
+    require(sp <= 32, s"sp ($sp) must be <= 32")
     require(sp == 0 || p <= sp, s"p ($p) cannot be greater than sp ($sp)")
     val zeroCounter = new HyperLogLogPlus(p, sp)
     aggregate(zeroCounter)(
@@ -1187,8 +1187,9 @@ abstract class RDD[T: ClassTag](
    *                   It must be greater than 0.000017.
    */
   def countApproxDistinct(relativeSD: Double = 0.05): Long = withScope {
+    require(relativeSD > 0.000017, s"accuracy ($relativeSD) must be greater than 0.000017")
     val p = math.ceil(2.0 * math.log(1.054 / relativeSD) / math.log(2)).toInt
-    countApproxDistinct(p, 0)
+    countApproxDistinct(if (p < 4) 4 else p, 0)
   }
 
   /**

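For reference, the precision derivation in the hunk above can be exercised in isolation. The standalone Scala sketch below (illustrative only; `RelativeSDToPrecision` is not part of Spark) mirrors the new validation, the p formula, and the clamp to 4:

  object RelativeSDToPrecision {
    // Mirrors the logic added in RDD.scala: validate relativeSD, derive p, clamp to >= 4.
    def precision(relativeSD: Double): Int = {
      require(relativeSD > 0.000017, s"accuracy ($relativeSD) must be greater than 0.000017")
      val p = math.ceil(2.0 * math.log(1.054 / relativeSD) / math.log(2)).toInt
      if (p < 4) 4 else p
    }

    def main(args: Array[String]): Unit = {
      // 0.5 -> raw p = 3, clamped to 4; 0.05 (the default) -> 9; 0.02 -> 12
      Seq(0.5, 0.05, 0.02).foreach(sd => println(s"relativeSD=$sd -> p=${precision(sd)}"))
    }
  }
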
http://git-wip-us.apache.org/repos/asf/spark/blob/dda6d9f4/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
index ef8c36a..afc11bd 100644
--- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
@@ -89,6 +89,8 @@ class RDDSuite extends FunSuite with SharedSparkContext {
     val simpleRdd = sc.makeRDD(uniformDistro, 10)
     assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.2)
     assert(error(simpleRdd.countApproxDistinct(12, 0), size) < 0.1)
+    assert(error(simpleRdd.countApproxDistinct(0.02), size) < 0.1)
+    assert(error(simpleRdd.countApproxDistinct(0.5), size) < 0.22)
   }
 
   test("SparkContext.union") {

http://git-wip-us.apache.org/repos/asf/spark/blob/dda6d9f4/python/pyspark/rdd.py
----------------------------------------------------------------------
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index d254deb..545c5ad 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -2239,8 +2239,6 @@ class RDD(object):
         """
         if relativeSD < 0.000017:
             raise ValueError("relativeSD should be greater than 0.000017")
-        if relativeSD > 0.37:
-            raise ValueError("relativeSD should be smaller than 0.37")
         # the hash space in Java is 2^32
         hashRDD = self.map(lambda x: portable_hash(x) & 0xFFFFFFFF)
         return hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD)

http://git-wip-us.apache.org/repos/asf/spark/blob/dda6d9f4/python/pyspark/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index ea63a39..09de4d1 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -644,7 +644,6 @@ class RDDTests(ReusedPySparkTestCase):
         self.assertTrue(18 < rdd.map(lambda x: (x, -x)).countApproxDistinct() < 22)
 
         self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.00000001))
-        self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.5))
 
     def test_histogram(self):
         # empty

