You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2019/07/19 06:48:40 UTC
[spark] branch master updated: [SPARK-28440][MLLIB][TEST] Use
TestingUtils to compare floating point values
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 52ddf03 [SPARK-28440][MLLIB][TEST] Use TestingUtils to compare floating point values
52ddf03 is described below
commit 52ddf038ec38c447f7c55ceb2ef9c63490b187a7
Author: Ievgen Prokhorenko <eu...@gmail.com>
AuthorDate: Thu Jul 18 23:48:12 2019 -0700
[SPARK-28440][MLLIB][TEST] Use TestingUtils to compare floating point values
## What changes were proposed in this pull request?
Use the `org.apache.spark.mllib.util.TestingUtils` object across the `MLLIB` component to compare floating point values in tests.
## How was this patch tested?
Ran `build/mvn test` — the existing tests pass against the updated code.
Closes #25191 from eugen-prokhorenko/mllib-testingutils-double-comparison.
Authored-by: Ievgen Prokhorenko <eu...@gmail.com>
Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
.../mllib/evaluation/MultilabelMetricsSuite.scala | 37 +++++++++++-----------
.../spark/mllib/fpm/AssociationRulesSuite.scala | 5 +--
.../org/apache/spark/mllib/fpm/FPGrowthSuite.scala | 3 +-
.../linalg/distributed/IndexedRowMatrixSuite.scala | 3 +-
.../mllib/random/RandomDataGeneratorSuite.scala | 6 ++--
.../spark/mllib/random/RandomRDDsSuite.scala | 11 +++----
.../apache/spark/mllib/stat/CorrelationSuite.scala | 15 +++++----
.../spark/mllib/stat/KernelDensitySuite.scala | 13 ++++----
.../spark/mllib/tree/EnsembleTestHelper.scala | 5 +--
9 files changed, 52 insertions(+), 46 deletions(-)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala
index a660492..03afd29 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.evaluation
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD
class MultilabelMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
@@ -79,24 +80,24 @@ class MultilabelMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
val hammingLoss = (1.0 / (7 * 3)) * (2 + 2 + 1 + 0 + 0 + 1 + 1)
val strictAccuracy = 2.0 / 7
val accuracy = 1.0 / 7 * (1.0 / 3 + 1.0 /3 + 0 + 1.0 / 1 + 2.0 / 2 + 2.0 / 3 + 1.0 / 2)
- assert(math.abs(metrics.precision(0.0) - precision0) < delta)
- assert(math.abs(metrics.precision(1.0) - precision1) < delta)
- assert(math.abs(metrics.precision(2.0) - precision2) < delta)
- assert(math.abs(metrics.recall(0.0) - recall0) < delta)
- assert(math.abs(metrics.recall(1.0) - recall1) < delta)
- assert(math.abs(metrics.recall(2.0) - recall2) < delta)
- assert(math.abs(metrics.f1Measure(0.0) - f1measure0) < delta)
- assert(math.abs(metrics.f1Measure(1.0) - f1measure1) < delta)
- assert(math.abs(metrics.f1Measure(2.0) - f1measure2) < delta)
- assert(math.abs(metrics.microPrecision - microPrecisionClass) < delta)
- assert(math.abs(metrics.microRecall - microRecallClass) < delta)
- assert(math.abs(metrics.microF1Measure - microF1MeasureClass) < delta)
- assert(math.abs(metrics.precision - macroPrecisionDoc) < delta)
- assert(math.abs(metrics.recall - macroRecallDoc) < delta)
- assert(math.abs(metrics.f1Measure - macroF1MeasureDoc) < delta)
- assert(math.abs(metrics.hammingLoss - hammingLoss) < delta)
- assert(math.abs(metrics.subsetAccuracy - strictAccuracy) < delta)
- assert(math.abs(metrics.accuracy - accuracy) < delta)
+ assert(metrics.precision(0.0) ~== precision0 absTol delta)
+ assert(metrics.precision(1.0) ~== precision1 absTol delta)
+ assert(metrics.precision(2.0) ~== precision2 absTol delta)
+ assert(metrics.recall(0.0) ~== recall0 absTol delta)
+ assert(metrics.recall(1.0) ~== recall1 absTol delta)
+ assert(metrics.recall(2.0) ~== recall2 absTol delta)
+ assert(metrics.f1Measure(0.0) ~== f1measure0 absTol delta)
+ assert(metrics.f1Measure(1.0) ~== f1measure1 absTol delta)
+ assert(metrics.f1Measure(2.0) ~== f1measure2 absTol delta)
+ assert(metrics.microPrecision ~== microPrecisionClass absTol delta)
+ assert(metrics.microRecall ~== microRecallClass absTol delta)
+ assert(metrics.microF1Measure ~== microF1MeasureClass absTol delta)
+ assert(metrics.precision ~== macroPrecisionDoc absTol delta)
+ assert(metrics.recall ~== macroRecallDoc absTol delta)
+ assert(metrics.f1Measure ~== macroF1MeasureDoc absTol delta)
+ assert(metrics.hammingLoss ~== hammingLoss absTol delta)
+ assert(metrics.subsetAccuracy ~== strictAccuracy absTol delta)
+ assert(metrics.accuracy ~== accuracy absTol delta)
assert(metrics.labels.sameElements(Array(0.0, 1.0, 2.0)))
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/AssociationRulesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/AssociationRulesSuite.scala
index dcb1f39..26a7569 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/AssociationRulesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/AssociationRulesSuite.scala
@@ -18,6 +18,7 @@ package org.apache.spark.mllib.fpm
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {
@@ -63,7 +64,7 @@ class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {
[1] 23
*/
assert(results1.size === 23)
- assert(results1.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
+ assert(results1.count(rule => rule.confidence ~= 1.0D absTol 1e-6) == 23)
val results2 = ar
.setMinConfidence(0)
@@ -84,7 +85,7 @@ class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {
[1] 23
*/
assert(results2.size === 30)
- assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
+ assert(results2.count(rule => rule.confidence ~= 1.0D absTol 1e-6) == 23)
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
index 20bd2e5..fa8f03b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
@@ -18,6 +18,7 @@ package org.apache.spark.mllib.fpm
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.Utils
class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
@@ -172,7 +173,7 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
.collect()
assert(rules.size === 23)
- assert(rules.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
+ assert(rules.count(rule => rule.confidence ~= 1.0D absTol 1e-6) == 23)
}
test("FP-Growth using Int type") {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala
index 566ce95..cca4eb4 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala
@@ -22,6 +22,7 @@ import breeze.linalg.{diag => brzDiag, DenseMatrix => BDM, DenseVector => BDV}
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD
class IndexedRowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
@@ -238,7 +239,7 @@ class IndexedRowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
for (i <- 0 until n; j <- i + 1 until n) {
val trueResult = gram(i, j) / scala.math.sqrt(gram(i, i) * gram(j, j))
- assert(math.abs(G(i, j) - trueResult) < 1e-6)
+ assert(G(i, j) ~== trueResult absTol 1e-6)
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
index e30ad15..8011026 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
@@ -20,9 +20,9 @@ package org.apache.spark.mllib.random
import org.apache.commons.math3.special.Gamma
import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.StatCounter
-// TODO update tests to use TestingUtils for floating point comparison after PR 1367 is merged
class RandomDataGeneratorSuite extends SparkFunSuite {
def apiChecks(gen: RandomDataGenerator[Double]) {
@@ -61,8 +61,8 @@ class RandomDataGeneratorSuite extends SparkFunSuite {
gen.setSeed(seed.toLong)
val sample = (0 until 100000).map { _ => gen.nextValue()}
val stats = new StatCounter(sample)
- assert(math.abs(stats.mean - mean) < epsilon)
- assert(math.abs(stats.stdev - stddev) < epsilon)
+ assert(stats.mean ~== mean absTol epsilon)
+ assert(stats.stdev ~== stddev absTol epsilon)
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomRDDsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomRDDsSuite.scala
index f464d25..9b4dc29 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomRDDsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomRDDsSuite.scala
@@ -23,14 +23,13 @@ import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.rdd.{RandomRDD, RandomRDDPartition}
import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.StatCounter
/*
* Note: avoid including APIs that do not set the seed for the RNG in unit tests
* in order to guarantee deterministic behavior.
- *
- * TODO update tests to use TestingUtils for floating point comparison after PR 1367 is merged
*/
class RandomRDDsSuite extends SparkFunSuite with MLlibTestSparkContext with Serializable {
@@ -43,8 +42,8 @@ class RandomRDDsSuite extends SparkFunSuite with MLlibTestSparkContext with Seri
val stats = rdd.stats()
assert(expectedSize === stats.count)
assert(expectedNumPartitions === rdd.partitions.size)
- assert(math.abs(stats.mean - expectedMean) < epsilon)
- assert(math.abs(stats.stdev - expectedStddev) < epsilon)
+ assert(stats.mean ~== expectedMean absTol epsilon)
+ assert(stats.stdev ~== expectedStddev absTol epsilon)
}
// assume test RDDs are small
@@ -63,8 +62,8 @@ class RandomRDDsSuite extends SparkFunSuite with MLlibTestSparkContext with Seri
}}
assert(expectedRows === values.size / expectedColumns)
val stats = new StatCounter(values)
- assert(math.abs(stats.mean - expectedMean) < epsilon)
- assert(math.abs(stats.stdev - expectedStddev) < epsilon)
+ assert(stats.mean ~== expectedMean absTol epsilon)
+ assert(stats.stdev ~== expectedStddev absTol epsilon)
}
test("RandomRDD sizes") {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
index e32767e..4613f7f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
@@ -26,6 +26,7 @@ import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.stat.correlation.{Correlations, PearsonCorrelation,
SpearmanCorrelation}
import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {
@@ -57,15 +58,15 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Log
val expected = 0.6546537
val default = Statistics.corr(x, y)
val p1 = Statistics.corr(x, y, "pearson")
- assert(approxEqual(expected, default))
- assert(approxEqual(expected, p1))
+ assert(expected ~== default absTol 1e-6)
+ assert(expected ~== p1 absTol 1e-6)
// numPartitions >= size for input RDDs
for (numParts <- List(xData.size, xData.size * 2)) {
val x1 = sc.parallelize(xData, numParts)
val y1 = sc.parallelize(yData, numParts)
val p2 = Statistics.corr(x1, y1)
- assert(approxEqual(expected, p2))
+ assert(expected ~== p2 absTol 1e-6)
}
// RDD of zero variance
@@ -78,14 +79,14 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Log
val y = sc.parallelize(yData)
val expected = 0.5
val s1 = Statistics.corr(x, y, "spearman")
- assert(approxEqual(expected, s1))
+ assert(expected ~== s1 absTol 1e-6)
// numPartitions >= size for input RDDs
for (numParts <- List(xData.size, xData.size * 2)) {
val x1 = sc.parallelize(xData, numParts)
val y1 = sc.parallelize(yData, numParts)
val s2 = Statistics.corr(x1, y1, "spearman")
- assert(approxEqual(expected, s2))
+ assert(expected ~== s2 absTol 1e-6)
}
// RDD of zero variance => zero variance in ranks
@@ -141,14 +142,14 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Log
val a = RandomRDDs.normalRDD(sc, 100000, 10).map(_ + 1000000000.0)
val b = RandomRDDs.normalRDD(sc, 100000, 10).map(_ + 1000000000.0)
val p = Statistics.corr(a, b, method = "pearson")
- assert(approxEqual(p, 0.0, 0.01))
+ assert(p ~== 0.0 absTol 0.01)
}
def approxEqual(v1: Double, v2: Double, threshold: Double = 1e-6): Boolean = {
if (v1.isNaN) {
v2.isNaN
} else {
- math.abs(v1 - v2) <= threshold
+ v1 ~== v2 absTol threshold
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
index 5feccdf..9cbb3d0 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
@@ -21,6 +21,7 @@ import org.apache.commons.math3.distribution.NormalDistribution
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
test("kernel density single sample") {
@@ -29,8 +30,8 @@ class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
val normal = new NormalDistribution(5.0, 3.0)
val acceptableErr = 1e-6
- assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
- assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
+ assert(densities(0) ~== normal.density(5.0) absTol acceptableErr)
+ assert(densities(1) ~== normal.density(6.0) absTol acceptableErr)
}
test("kernel density multiple samples") {
@@ -40,9 +41,9 @@ class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
val normal1 = new NormalDistribution(5.0, 3.0)
val normal2 = new NormalDistribution(10.0, 3.0)
val acceptableErr = 1e-6
- assert(math.abs(
- densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
- assert(math.abs(
- densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
+ assert(
+ densities(0) ~== ((normal1.density(5.0) + normal2.density(5.0)) / 2) absTol acceptableErr)
+ assert(
+ densities(1) ~== ((normal1.density(6.0) + normal2.density(6.0)) / 2) absTol acceptableErr)
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
index 1cc8f34..d43e62b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
@@ -22,6 +22,7 @@ import scala.collection.mutable
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
+import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.StatCounter
object EnsembleTestHelper {
@@ -43,8 +44,8 @@ object EnsembleTestHelper {
values ++= row
}
val stats = new StatCounter(values)
- assert(math.abs(stats.mean - expectedMean) < epsilon)
- assert(math.abs(stats.stdev - expectedStddev) < epsilon)
+ assert(stats.mean ~== expectedMean absTol epsilon)
+ assert(stats.stdev ~== expectedStddev absTol epsilon)
}
def validateClassifier(
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org