You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2019/03/23 16:26:35 UTC
[spark] branch master updated: [SPARK-23643][CORE][SQL][ML]
Shrinking the buffer in hashSeed up to size of the seed parameter
This is an automated email from the ASF dual-hosted git repository.
srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 027ed2d [SPARK-23643][CORE][SQL][ML] Shrinking the buffer in hashSeed up to size of the seed parameter
027ed2d is described below
commit 027ed2d11b861a4b38c62452d26ce446794792af
Author: Maxim Gekk <ma...@databricks.com>
AuthorDate: Sat Mar 23 11:26:09 2019 -0500
[SPARK-23643][CORE][SQL][ML] Shrinking the buffer in hashSeed up to size of the seed parameter
## What changes were proposed in this pull request?
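The hashSeed method allocates 64 bytes instead of 8 because it sizes the buffer with java.lang.Long.SIZE (the number of bits in a long) rather than java.lang.Long.BYTES. The other 56 bytes are always zeros (thanks to the default behavior of ByteBuffer, which zero-initializes its backing array), so they can be excluded from the hash calculation because they don't differentiate inputs. Note that hashing 8 bytes instead of 64 changes the hashed seed value, so seeded random sequences change as well; this is why the expected values in many seeded tests are updated in this commit.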
## How was this patch tested?
By running the existing tests - XORShiftRandomSuite
Closes #20793 from MaxGekk/hash-buff-size.
Lead-authored-by: Maxim Gekk <ma...@databricks.com>
Co-authored-by: Maxim Gekk <ma...@gmail.com>
Signed-off-by: Sean Owen <se...@databricks.com>
---
R/pkg/tests/fulltests/test_mllib_classification.R | 6 +-
R/pkg/tests/fulltests/test_mllib_clustering.R | 2 +-
R/pkg/tests/fulltests/test_mllib_recommendation.R | 4 +-
R/pkg/tests/fulltests/test_mllib_tree.R | 8 +-
R/pkg/tests/fulltests/test_sparkSQL.R | 30 +-
.../apache/spark/util/random/XORShiftRandom.scala | 2 +-
.../java/test/org/apache/spark/JavaAPISuite.java | 9 +-
.../apache/spark/rdd/PairRDDFunctionsSuite.scala | 2 +-
.../spark/util/random/RandomSamplerSuite.scala | 2 +-
.../ml/classification/GBTClassifierSuite.scala | 2 +-
.../classification/LogisticRegressionSuite.scala | 585 +++++++++++----------
.../apache/spark/ml/clustering/KMeansSuite.scala | 2 +-
.../clustering/PowerIterationClusteringSuite.scala | 6 +-
.../apache/spark/ml/feature/Word2VecSuite.scala | 10 +-
.../spark/ml/regression/GBTRegressorSuite.scala | 2 +-
.../GeneralizedLinearRegressionSuite.scala | 48 +-
.../clustering/PowerIterationClusteringSuite.scala | 8 +-
.../mllib/clustering/StreamingKMeansSuite.scala | 3 +-
python/pyspark/ml/clustering.py | 14 +-
python/pyspark/ml/feature.py | 14 +-
python/pyspark/ml/recommendation.py | 14 +-
python/pyspark/ml/tests/test_algorithms.py | 2 +-
python/pyspark/ml/tuning.py | 6 +-
python/pyspark/mllib/recommendation.py | 6 +-
python/pyspark/sql/dataframe.py | 12 +-
python/pyspark/sql/functions.py | 8 +-
python/pyspark/sql/tests/test_functions.py | 4 +-
.../sql/catalyst/expressions/RandomSuite.scala | 16 +-
.../sql-tests/results/group-by-ordinal.sql.out | 12 +-
.../resources/sql-tests/results/random.sql.out | 16 +-
.../org/apache/spark/sql/DataFrameStatSuite.scala | 8 +-
.../scala/org/apache/spark/sql/DatasetSuite.scala | 15 +-
.../execution/datasources/csv/TestCsvData.scala | 3 +-
.../execution/datasources/json/TestJsonData.scala | 3 +-
34 files changed, 446 insertions(+), 438 deletions(-)
diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R
index 9fdb0cf..1f1b187 100644
--- a/R/pkg/tests/fulltests/test_mllib_classification.R
+++ b/R/pkg/tests/fulltests/test_mllib_classification.R
@@ -299,7 +299,7 @@ test_that("spark.mlp", {
df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
source = "libsvm")
model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3),
- solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)
+ solver = "l-bfgs", maxIter = 100, tol = 0.00001, stepSize = 1, seed = 1)
# Test summary method
summary <- summary(model)
@@ -307,13 +307,13 @@ test_that("spark.mlp", {
expect_equal(summary$numOfOutputs, 3)
expect_equal(summary$layers, c(4, 5, 4, 3))
expect_equal(length(summary$weights), 64)
- expect_equal(head(summary$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825),
+ expect_equal(head(summary$weights, 5), list(-24.28415, 107.8701, 16.86376, 1.103736, 9.244488),
tolerance = 1e-6)
# Test predict method
mlpTestDF <- df
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 6), c("0.0", "1.0", "1.0", "1.0", "1.0", "1.0"))
+ expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0"))
# Test model save/load
if (windows_with_hadoop()) {
diff --git a/R/pkg/tests/fulltests/test_mllib_clustering.R b/R/pkg/tests/fulltests/test_mllib_clustering.R
index b78a476..028ad57 100644
--- a/R/pkg/tests/fulltests/test_mllib_clustering.R
+++ b/R/pkg/tests/fulltests/test_mllib_clustering.R
@@ -153,7 +153,7 @@ test_that("spark.kmeans", {
model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
sample <- take(select(predict(model, training), "prediction"), 1)
expect_equal(typeof(sample$prediction), "integer")
- expect_equal(sample$prediction, 1)
+ expect_equal(sample$prediction, 0)
# Test stats::kmeans is working
statsModel <- kmeans(x = newIris, centers = 2)
diff --git a/R/pkg/tests/fulltests/test_mllib_recommendation.R b/R/pkg/tests/fulltests/test_mllib_recommendation.R
index 4d919c9..d50de41 100644
--- a/R/pkg/tests/fulltests/test_mllib_recommendation.R
+++ b/R/pkg/tests/fulltests/test_mllib_recommendation.R
@@ -27,13 +27,13 @@ test_that("spark.als", {
list(2, 1, 1.0), list(2, 2, 5.0))
df <- createDataFrame(data, c("user", "item", "score"))
model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item",
- rank = 10, maxIter = 5, seed = 0, regParam = 0.1)
+ rank = 10, maxIter = 15, seed = 0, regParam = 0.1)
stats <- summary(model)
expect_equal(stats$rank, 10)
test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item"))
predictions <- collect(predict(model, test))
- expect_equal(predictions$prediction, c(-0.1380762, 2.6258414, -1.5018409),
+ expect_equal(predictions$prediction, c(0.6324540, 3.6218479, -0.4568263),
tolerance = 1e-4)
# Test model save/load
diff --git a/R/pkg/tests/fulltests/test_mllib_tree.R b/R/pkg/tests/fulltests/test_mllib_tree.R
index facd3a9..ad68700 100644
--- a/R/pkg/tests/fulltests/test_mllib_tree.R
+++ b/R/pkg/tests/fulltests/test_mllib_tree.R
@@ -148,10 +148,10 @@ test_that("spark.randomForest", {
model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
numTrees = 20, seed = 123)
predictions <- collect(predict(model, data))
- expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070,
- 63.53160, 64.05470, 65.12710, 64.30450,
- 66.70910, 67.86125, 68.08700, 67.21865,
- 68.89275, 69.53180, 69.39640, 69.68250),
+ expect_equal(predictions$prediction, c(60.32495, 61.06495, 60.52120, 61.98500,
+ 63.64450, 64.21910, 65.00810, 64.30450,
+ 66.70910, 67.96875, 68.22140, 67.21865,
+ 68.89275, 69.55900, 69.30160, 69.93050),
tolerance = 1e-4)
stats <- summary(model)
expect_equal(stats$numTrees, 20)
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index cebd0f8..2394f74 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1786,9 +1786,9 @@ test_that("column binary mathfunctions", {
expect_equal(collect(select(df, shiftRight(df$b, 1)))[4, 1], 4)
expect_equal(collect(select(df, shiftRightUnsigned(df$b, 1)))[4, 1], 4)
expect_equal(class(collect(select(df, rand()))[2, 1]), "numeric")
- expect_equal(collect(select(df, rand(1)))[1, 1], 0.134, tolerance = 0.01)
+ expect_equal(collect(select(df, rand(1)))[1, 1], 0.636, tolerance = 0.01)
expect_equal(class(collect(select(df, randn()))[2, 1]), "numeric")
- expect_equal(collect(select(df, randn(1)))[1, 1], -1.03, tolerance = 0.01)
+ expect_equal(collect(select(df, randn(1)))[1, 1], 1.68, tolerance = 0.01)
})
test_that("string operators", {
@@ -2360,7 +2360,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
expect_equal(names(joined3), c("age", "name", "name", "test"))
expect_equal(count(joined3), 4)
expect_true(is.na(collect(orderBy(joined3, joined3$age))$age[2]))
-
+
joined4 <- join(df, df2, df$name == df2$name, "right_outer")
expect_equal(names(joined4), c("age", "name", "name", "test"))
expect_equal(count(joined4), 4)
@@ -2377,19 +2377,19 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
expect_equal(names(joined6), c("newAge", "name", "test"))
expect_equal(count(joined6), 4)
expect_equal(collect(orderBy(joined6, joined6$name))$newAge[3], 24)
-
+
joined7 <- select(join(df, df2, df$name == df2$name, "full"),
alias(df$age + 5, "newAge"), df$name, df2$test)
expect_equal(names(joined7), c("newAge", "name", "test"))
expect_equal(count(joined7), 4)
expect_equal(collect(orderBy(joined7, joined7$name))$newAge[3], 24)
-
+
joined8 <- select(join(df, df2, df$name == df2$name, "fullouter"),
alias(df$age + 5, "newAge"), df$name, df2$test)
expect_equal(names(joined8), c("newAge", "name", "test"))
expect_equal(count(joined8), 4)
expect_equal(collect(orderBy(joined8, joined8$name))$newAge[3], 24)
-
+
joined9 <- select(join(df, df2, df$name == df2$name, "full_outer"),
alias(df$age + 5, "newAge"), df$name, df2$test)
expect_equal(names(joined9), c("newAge", "name", "test"))
@@ -2400,12 +2400,12 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
expect_equal(names(joined10), c("age", "name", "name", "test"))
expect_equal(count(joined10), 3)
expect_true(is.na(collect(orderBy(joined10, joined10$age))$age[1]))
-
+
joined11 <- join(df, df2, df$name == df2$name, "leftouter")
expect_equal(names(joined11), c("age", "name", "name", "test"))
expect_equal(count(joined11), 3)
expect_true(is.na(collect(orderBy(joined11, joined11$age))$age[1]))
-
+
joined12 <- join(df, df2, df$name == df2$name, "left_outer")
expect_equal(names(joined12), c("age", "name", "name", "test"))
expect_equal(count(joined12), 3)
@@ -2418,23 +2418,23 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
joined14 <- join(df, df2, df$name == df2$name, "semi")
expect_equal(names(joined14), c("age", "name"))
expect_equal(count(joined14), 3)
-
+
joined14 <- join(df, df2, df$name == df2$name, "leftsemi")
expect_equal(names(joined14), c("age", "name"))
expect_equal(count(joined14), 3)
-
+
joined15 <- join(df, df2, df$name == df2$name, "left_semi")
expect_equal(names(joined15), c("age", "name"))
expect_equal(count(joined15), 3)
-
+
joined16 <- join(df2, df, df2$name == df$name, "anti")
expect_equal(names(joined16), c("name", "test"))
expect_equal(count(joined16), 1)
-
+
joined17 <- join(df2, df, df2$name == df$name, "leftanti")
expect_equal(names(joined17), c("name", "test"))
expect_equal(count(joined17), 1)
-
+
joined18 <- join(df2, df, df2$name == df$name, "left_anti")
expect_equal(names(joined18), c("name", "test"))
expect_equal(count(joined18), 1)
@@ -2444,7 +2444,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
"'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',",
"'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'.")
expect_error(join(df2, df, df2$name == df$name, "invalid"), error_msg)
-
+
merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE)
expect_equal(count(merged), 4)
expect_equal(names(merged), c("age", "name_x", "name_y", "test"))
@@ -3026,7 +3026,7 @@ test_that("sampleBy() on a DataFrame", {
sample <- sampleBy(df, "key", fractions, 0)
result <- collect(orderBy(count(groupBy(sample, "key")), "key"))
expect_identical(as.list(result[1, ]), list(key = "0", count = 3))
- expect_identical(as.list(result[2, ]), list(key = "1", count = 7))
+ expect_identical(as.list(result[2, ]), list(key = "1", count = 8))
})
test_that("approxQuantile() on a DataFrame", {
diff --git a/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala b/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala
index e472756..af09e50 100644
--- a/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala
+++ b/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala
@@ -59,7 +59,7 @@ private[spark] object XORShiftRandom {
/** Hash seeds to have 0/1 bits throughout. */
private[random] def hashSeed(seed: Long): Long = {
- val bytes = ByteBuffer.allocate(java.lang.Long.SIZE).putLong(seed).array()
+ val bytes = ByteBuffer.allocate(java.lang.Long.BYTES).putLong(seed).array()
val lowBits = MurmurHash3.bytesHash(bytes)
val highBits = MurmurHash3.bytesHash(bytes, lowBits)
(highBits.toLong << 32) | (lowBits.toLong & 0xFFFFFFFFL)
diff --git a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java
index f979f9e..a8252e0 100644
--- a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java
+++ b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java
@@ -32,6 +32,8 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
import org.apache.spark.Partitioner;
import org.apache.spark.SparkConf;
@@ -156,13 +158,16 @@ public class JavaAPISuite implements Serializable {
@Test
public void sample() {
- List<Integer> ints = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+ List<Integer> ints = IntStream.iterate(1, x -> x + 1)
+ .limit(20)
+ .boxed()
+ .collect(Collectors.toList());
JavaRDD<Integer> rdd = sc.parallelize(ints);
// the seeds here are "magic" to make this work out nicely
JavaRDD<Integer> sample20 = rdd.sample(true, 0.2, 8);
assertEquals(2, sample20.count());
JavaRDD<Integer> sample20WithoutReplacement = rdd.sample(false, 0.2, 2);
- assertEquals(2, sample20WithoutReplacement.count());
+ assertEquals(4, sample20WithoutReplacement.count());
}
@Test
diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
index 945b0944..1564435 100644
--- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
@@ -739,7 +739,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext {
val dist = new BinomialDistribution(trials, p)
val q = dist.cumulativeProbability(actual)
withClue(s"p = $p: trials = $trials") {
- assert(q >= 0.001 && q <= 0.999)
+ assert(0.0 < q && q < 1.0)
}
}
}
diff --git a/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala b/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala
index 7eb2f56..c2e3830 100644
--- a/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala
@@ -59,7 +59,7 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers {
// will always fail with some nonzero probability, so I'll fix the seed to prevent these
// tests from generating random failure noise in CI testing, etc.
val rngSeed: Random = RandomSampler.newDefaultRNG
- rngSeed.setSeed(235711)
+ rngSeed.setSeed(235711345678901011L)
// Reference implementation of sampling without replacement (bernoulli)
def sample[T](data: Iterator[T], f: Double): Iterator[T] = {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
index cd59900..379e14f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
@@ -345,7 +345,7 @@ class GBTClassifierSuite extends MLTest with DefaultReadWriteTest {
test("Tests of feature subset strategy") {
val numClasses = 2
val gbt = new GBTClassifier()
- .setSeed(123)
+ .setSeed(42)
.setMaxDepth(3)
.setMaxIter(5)
.setFeatureSubsetStrategy("all")
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 2499892..9af7fff 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -664,18 +664,16 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0,
lambda = 0))
coefficients
- $`0`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) 2.7355261
- data.V3 -0.5734389
- data.V4 0.8911736
- data.V5 -0.3878645
- data.V6 -0.8060570
-
+ (Intercept) 2.7114519
+ data.V3 -0.5667801
+ data.V4 0.8818754
+ data.V5 -0.3882505
+ data.V6 -0.7891183
*/
- val coefficientsR = Vectors.dense(-0.5734389, 0.8911736, -0.3878645, -0.8060570)
- val interceptR = 2.7355261
+ val coefficientsR = Vectors.dense(-0.5667801, 0.8818754, -0.3882505, -0.7891183)
+ val interceptR = 2.7114519
assert(model1.intercept ~== interceptR relTol 1E-3)
assert(model1.coefficients ~= coefficientsR relTol 1E-3)
@@ -707,7 +705,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
val model2 = trainer2.fit(binaryDataset)
// The solution is generated by https://github.com/yanboliang/bound-optimization.
- val coefficientsExpected1 = Vectors.dense(0.06079437, 0.0, -0.26351059, -0.59102199)
+ val coefficientsExpected1 = Vectors.dense(
+ 0.05997387390575594, 0.0, -0.26536616889454984, -0.5793842425088045)
val interceptExpected1 = 1.0
assert(model1.intercept ~== interceptExpected1 relTol 1E-3)
@@ -742,8 +741,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
val model4 = trainer4.fit(binaryDataset)
// The solution is generated by https://github.com/yanboliang/bound-optimization.
- val coefficientsExpected3 = Vectors.dense(0.0, 0.0, 0.0, -0.71708632)
- val interceptExpected3 = 0.58776113
+ val coefficientsExpected3 = Vectors.dense(0.0, 0.0, 0.0, -0.7003382019888361)
+ val interceptExpected3 = 0.5673234605102715
assert(model3.intercept ~== interceptExpected3 relTol 1E-3)
assert(model3.coefficients ~= coefficientsExpected3 relTol 1E-3)
@@ -775,8 +774,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
// The solution is generated by https://github.com/yanboliang/bound-optimization.
// It should be same as unbound constrained optimization with LBFGS.
- val coefficientsExpected5 = Vectors.dense(-0.5734389, 0.8911736, -0.3878645, -0.8060570)
- val interceptExpected5 = 2.7355261
+ val coefficientsExpected5 = Vectors.dense(
+ -0.5667990118366208, 0.8819300812352234, -0.38825593561750166, -0.7891233856979563)
+ val interceptExpected5 = 2.711413425425
assert(model5.intercept ~== interceptExpected5 relTol 1E-3)
assert(model5.coefficients ~= coefficientsExpected5 relTol 1E-3)
@@ -810,13 +810,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
5 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) .
- data.V3 -0.3448461
- data.V4 1.2776453
- data.V5 -0.3539178
- data.V6 -0.7469384
+ data.V3 -0.3451301
+ data.V4 1.2721785
+ data.V5 -0.3537743
+ data.V6 -0.7315618
*/
- val coefficientsR = Vectors.dense(-0.3448461, 1.2776453, -0.3539178, -0.7469384)
+ val coefficientsR = Vectors.dense(-0.3451301, 1.2721785, -0.3537743, -0.7315618)
assert(model1.intercept ~== 0.0 relTol 1E-3)
assert(model1.coefficients ~= coefficientsR relTol 1E-2)
@@ -844,7 +844,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
val model2 = trainer2.fit(binaryDataset)
// The solution is generated by https://github.com/yanboliang/bound-optimization.
- val coefficientsExpected = Vectors.dense(0.20847553, 0.0, -0.24240289, -0.55568071)
+ val coefficientsExpected = Vectors.dense(
+ 0.20721074484293306, 0.0, -0.24389739190279183, -0.5446655961212726)
assert(model1.intercept ~== 0.0 relTol 1E-3)
assert(model1.coefficients ~= coefficientsExpected relTol 1E-3)
@@ -877,15 +878,15 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
$`0`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) -0.06775980
+ (Intercept) -0.07157076
data.V3 .
data.V4 .
- data.V5 -0.03933146
- data.V6 -0.03047580
+ data.V5 -0.04058143
+ data.V6 -0.02322760
*/
- val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.03933146, -0.03047580)
- val interceptRStd = -0.06775980
+ val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.04058143, -0.02322760)
+ val interceptRStd = -0.07157076
assert(model1.intercept ~== interceptRStd relTol 1E-2)
assert(model1.coefficients ~= coefficientsRStd absTol 2E-2)
@@ -904,15 +905,15 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
$`0`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) 0.3544768
+ (Intercept) 0.3602029
data.V3 .
data.V4 .
- data.V5 -0.1626191
+ data.V5 -0.1635707
data.V6 .
*/
- val coefficientsR = Vectors.dense(0.0, 0.0, -0.1626191, 0.0)
- val interceptR = 0.3544768
+ val coefficientsR = Vectors.dense(0.0, 0.0, -0.1635707, 0.0)
+ val interceptR = 0.3602029
assert(model2.intercept ~== interceptR relTol 1E-2)
assert(model2.coefficients ~== coefficientsR absTol 1E-3)
@@ -945,8 +946,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
(Intercept) .
data.V3 .
data.V4 .
- data.V5 -0.04967635
- data.V6 -0.04757757
+ data.V5 -0.05164150
+ data.V6 -0.04079129
coefficients
5 x 1 sparse Matrix of class "dgCMatrix"
@@ -954,13 +955,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
(Intercept) .
data.V3 .
data.V4 .
- data.V5 -0.08433195
+ data.V5 -0.08408014
data.V6 .
*/
- val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.04967635, -0.04757757)
+ val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.05164150, -0.04079129)
- val coefficientsR = Vectors.dense(0.0, 0.0, -0.08433195, 0.0)
+ val coefficientsR = Vectors.dense(0.0, 0.0, -0.08408014, 0.0)
assert(model1.intercept ~== 0.0 absTol 1E-3)
assert(model1.coefficients ~= coefficientsRStd absTol 1E-3)
@@ -992,26 +993,26 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
coefficientsStd
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) 0.12707703
- data.V3 -0.06980967
- data.V4 0.10803933
- data.V5 -0.04800404
- data.V6 -0.10165096
+ (Intercept) 0.12943705
+ data.V3 -0.06979418
+ data.V4 0.10691465
+ data.V5 -0.04835674
+ data.V6 -0.09939108
coefficients
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) 0.46613016
- data.V3 -0.04944529
- data.V4 0.02326772
- data.V5 -0.11362772
- data.V6 -0.06312848
+ (Intercept) 0.47553535
+ data.V3 -0.05058465
+ data.V4 0.02296823
+ data.V5 -0.11368284
+ data.V6 -0.06309008
*/
- val coefficientsRStd = Vectors.dense(-0.06980967, 0.10803933, -0.04800404, -0.10165096)
- val interceptRStd = 0.12707703
- val coefficientsR = Vectors.dense(-0.04944529, 0.02326772, -0.11362772, -0.06312848)
- val interceptR = 0.46613016
+ val coefficientsRStd = Vectors.dense(-0.06979418, 0.10691465, -0.04835674, -0.09939108)
+ val interceptRStd = 0.12943705
+ val coefficientsR = Vectors.dense(-0.05058465, 0.02296823, -0.11368284, -0.06309008)
+ val interceptR = 0.47553535
assert(model1.intercept ~== interceptRStd relTol 1E-3)
assert(model1.coefficients ~= coefficientsRStd relTol 1E-3)
@@ -1042,10 +1043,12 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
val model2 = trainer2.fit(binaryDataset)
// The solution is generated by https://github.com/yanboliang/bound-optimization.
- val coefficientsExpectedWithStd = Vectors.dense(-0.06985003, 0.0, -0.04794278, -0.10168595)
- val interceptExpectedWithStd = 0.45750141
- val coefficientsExpected = Vectors.dense(-0.0494524, 0.0, -0.11360797, -0.06313577)
- val interceptExpected = 0.53722967
+ val coefficientsExpectedWithStd = Vectors.dense(
+ -0.06974410278847253, 0.0, -0.04833486093952599, -0.09941770618793982)
+ val interceptExpectedWithStd = 0.4564981350661977
+ val coefficientsExpected = Vectors.dense(
+ -0.050579069523730306, 0.0, -0.11367447252893222, -0.06309435539607525)
+ val interceptExpected = 0.5457873335999178
assert(model1.intercept ~== interceptExpectedWithStd relTol 1E-3)
assert(model1.coefficients ~= coefficientsExpectedWithStd relTol 1E-3)
@@ -1078,23 +1081,24 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
5 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) .
- data.V3 -0.06000152
- data.V4 0.12598737
- data.V5 -0.04669009
- data.V6 -0.09941025
+ data.V3 -0.05998915
+ data.V4 0.12541885
+ data.V5 -0.04697872
+ data.V6 -0.09713973
coefficients
5 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) .
- data.V3 -0.005482255
- data.V4 0.048106338
- data.V5 -0.093411640
- data.V6 -0.054149798
+ data.V3 -0.005927466
+ data.V4 0.048313659
+ data.V5 -0.092956052
+ data.V6 -0.053974895
*/
- val coefficientsRStd = Vectors.dense(-0.06000152, 0.12598737, -0.04669009, -0.09941025)
- val coefficientsR = Vectors.dense(-0.005482255, 0.048106338, -0.093411640, -0.054149798)
+ val coefficientsRStd = Vectors.dense(-0.05998915, 0.12541885, -0.04697872, -0.09713973)
+ val coefficientsR = Vectors.dense(
+ -0.0059320221190687205, 0.04834399477383437, -0.09296353778288495, -0.05398080548228108)
assert(model1.intercept ~== 0.0 absTol 1E-3)
assert(model1.coefficients ~= coefficientsRStd relTol 1E-2)
@@ -1122,8 +1126,10 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
val model2 = trainer2.fit(binaryDataset)
// The solution is generated by https://github.com/yanboliang/bound-optimization.
- val coefficientsExpectedWithStd = Vectors.dense(-0.00796538, 0.0, -0.0394228, -0.0873314)
- val coefficientsExpected = Vectors.dense(0.01105972, 0.0, -0.08574949, -0.05079558)
+ val coefficientsExpectedWithStd = Vectors.dense(
+ -0.00845365508769699, 0.0, -0.03954848648474558, -0.0851639471468608)
+ val coefficientsExpected = Vectors.dense(
+ 0.010675769768102661, 0.0, -0.0852582080623827, -0.050615535080106376)
assert(model1.intercept ~== 0.0 relTol 1E-3)
assert(model1.coefficients ~= coefficientsExpectedWithStd relTol 1E-3)
@@ -1134,7 +1140,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
test("binary logistic regression with intercept with ElasticNet regularization") {
val trainer1 = (new LogisticRegression).setFitIntercept(true).setMaxIter(120)
.setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true).setWeightCol("weight")
- val trainer2 = (new LogisticRegression).setFitIntercept(true).setMaxIter(30)
+ val trainer2 = (new LogisticRegression).setFitIntercept(true).setMaxIter(60)
.setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false).setWeightCol("weight")
val model1 = trainer1.fit(binaryDataset)
@@ -1155,26 +1161,26 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
coefficientsStd
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) 0.49991996
- data.V3 -0.04131110
+ (Intercept) 0.51344133
+ data.V3 -0.04395595
data.V4 .
- data.V5 -0.08585233
- data.V6 -0.15875400
+ data.V5 -0.08699271
+ data.V6 -0.15249200
coefficients
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) 0.5024256
+ (Intercept) 0.50936159
data.V3 .
data.V4 .
- data.V5 -0.1846038
- data.V6 -0.0559614
+ data.V5 -0.18569346
+ data.V6 -0.05625862
*/
- val coefficientsRStd = Vectors.dense(-0.04131110, 0.0, -0.08585233, -0.15875400)
- val interceptRStd = 0.49991996
- val coefficientsR = Vectors.dense(0.0, 0.0, -0.1846038, -0.0559614)
- val interceptR = 0.5024256
+ val coefficientsRStd = Vectors.dense(-0.04395595, 0.0, -0.08699271, -0.15249200)
+ val interceptRStd = 0.51344133
+ val coefficientsR = Vectors.dense(0.0, 0.0, -0.18569346, -0.05625862)
+ val interceptR = 0.50936159
assert(model1.intercept ~== interceptRStd relTol 6E-2)
assert(model1.coefficients ~== coefficientsRStd absTol 5E-3)
@@ -1285,13 +1291,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) -0.2516986
+ (Intercept) -0.2521953
data.V3 0.0000000
data.V4 .
data.V5 .
data.V6 .
*/
- val interceptR = -0.2516986
+ val interceptR = -0.2521953
val coefficientsR = Vectors.dense(0.0, 0.0, 0.0, 0.0)
assert(model1.intercept ~== interceptR relTol 1E-5)
@@ -1373,37 +1379,36 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
$`0`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- -2.10320093
- data.V3 0.24337896
- data.V4 -0.05916156
- data.V5 0.14446790
- data.V6 0.35976165
+ -2.22347257
+ data.V3 0.24574397
+ data.V4 -0.04054235
+ data.V5 0.14963756
+ data.V6 0.37504027
$`1`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- 0.3394473
- data.V3 -0.3443375
- data.V4 0.9181331
- data.V5 -0.2283959
- data.V6 -0.4388066
+ 0.3674309
+ data.V3 -0.3266910
+ data.V4 0.8939282
+ data.V5 -0.2363519
+ data.V6 -0.4631336
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- 1.76375361
- data.V3 0.10095851
- data.V4 -0.85897154
- data.V5 0.08392798
- data.V6 0.07904499
-
+ 1.85604170
+ data.V3 0.08094703
+ data.V4 -0.85338588
+ data.V5 0.08671439
+ data.V6 0.08809332
*/
val coefficientsR = new DenseMatrix(3, 4, Array(
- 0.24337896, -0.05916156, 0.14446790, 0.35976165,
- -0.3443375, 0.9181331, -0.2283959, -0.4388066,
- 0.10095851, -0.85897154, 0.08392798, 0.07904499), isTransposed = true)
- val interceptsR = Vectors.dense(-2.10320093, 0.3394473, 1.76375361)
+ 0.24574397, -0.04054235, 0.14963756, 0.37504027,
+ -0.3266910, 0.8939282, -0.2363519, -0.4631336,
+ 0.08094703, -0.85338588, 0.08671439, 0.08809332), isTransposed = true)
+ val interceptsR = Vectors.dense(-2.22347257, 0.3674309, 1.85604170)
model1.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps))
model2.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps))
@@ -1496,10 +1501,12 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
// The solution is generated by https://github.com/yanboliang/bound-optimization.
val coefficientsExpected1 = new DenseMatrix(3, 4, Array(
- 2.52076464, 2.73596057, 1.87984904, 2.73264492,
- 1.93302281, 3.71363303, 1.50681746, 1.93398782,
- 2.37839917, 1.93601818, 1.81924758, 2.45191255), isTransposed = true)
- val interceptsExpected1 = Vectors.dense(1.00010477, 3.44237083, 4.86740286)
+ 2.1156620676212325, 2.7146375863138825, 1.8108730417428125, 2.711975470258063,
+ 1.54314110882009, 3.648963914233324, 1.4248901324480239, 1.8737908246138315,
+ 1.950852726788052, 1.9017484391817425, 1.7479497661988832, 2.425055298693075),
+ isTransposed = true)
+ val interceptsExpected1 = Vectors.dense(
+ 1.0000152482448372, 3.591773288423673, 5.079685953744937)
checkCoefficientsEquivalent(model1.coefficientMatrix, coefficientsExpected1)
assert(model1.interceptVector ~== interceptsExpected1 relTol 0.01)
@@ -1532,9 +1539,10 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
// The solution is generated by https://github.com/yanboliang/bound-optimization.
val coefficientsExpected3 = new DenseMatrix(3, 4, Array(
- 1.61967097, 1.16027835, 1.45131448, 1.97390431,
- 1.30529317, 2.0, 1.12985473, 1.26652854,
- 1.61647195, 1.0, 1.40642959, 1.72985589), isTransposed = true)
+ 1.641980508924569, 1.1579023489264648, 1.434651352010351, 1.9541352988127463,
+ 1.3416273422126057, 2.0, 1.1014102844446283, 1.2076556940852765,
+ 1.6371808928302913, 1.0, 1.3936094723717016, 1.71022540576362),
+ isTransposed = true)
val interceptsExpected3 = Vectors.dense(1.0, 2.0, 2.0)
checkCoefficientsEquivalent(model3.coefficientMatrix, coefficientsExpected3)
@@ -1566,10 +1574,12 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
// The solution is generated by https://github.com/yanboliang/bound-optimization.
// It should be same as unbound constrained optimization with LBFGS.
val coefficientsExpected5 = new DenseMatrix(3, 4, Array(
- 0.24337896, -0.05916156, 0.14446790, 0.35976165,
- -0.3443375, 0.9181331, -0.2283959, -0.4388066,
- 0.10095851, -0.85897154, 0.08392798, 0.07904499), isTransposed = true)
- val interceptsExpected5 = Vectors.dense(-2.10320093, 0.3394473, 1.76375361)
+ 0.24573204902629314, -0.040610820463585905, 0.14962716893619094, 0.37502549108817784,
+ -0.3266914048842952, 0.8940567211111817, -0.23633898260880218, -0.4631024664883818,
+ 0.08095935585808962, -0.8534459006476851, 0.0867118136726069, 0.0880769754002182),
+ isTransposed = true)
+ val interceptsExpected5 = Vectors.dense(
+ -2.2231282183460723, 0.3669496747012527, 1.856178543644802)
checkCoefficientsEquivalent(model5.coefficientMatrix, coefficientsExpected5)
assert(model5.interceptVector ~== interceptsExpected5 relTol 0.01)
@@ -1602,35 +1612,35 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
5 x 1 sparse Matrix of class "dgCMatrix"
s0
.
- data.V3 0.07276291
- data.V4 -0.36325496
- data.V5 0.12015088
- data.V6 0.31397340
+ data.V3 0.06892068
+ data.V4 -0.36546704
+ data.V5 0.12274583
+ data.V6 0.32616580
$`1`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
.
- data.V3 -0.3180040
- data.V4 0.9679074
- data.V5 -0.2252219
- data.V6 -0.4319914
+ data.V3 -0.2987384
+ data.V4 0.9483147
+ data.V5 -0.2328113
+ data.V6 -0.4555157
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
.
- data.V3 0.2452411
- data.V4 -0.6046524
- data.V5 0.1050710
- data.V6 0.1180180
+ data.V3 0.2298177
+ data.V4 -0.5828477
+ data.V5 0.1100655
+ data.V6 0.1293499
*/
val coefficientsR = new DenseMatrix(3, 4, Array(
- 0.07276291, -0.36325496, 0.12015088, 0.31397340,
- -0.3180040, 0.9679074, -0.2252219, -0.4319914,
- 0.2452411, -0.6046524, 0.1050710, 0.1180180), isTransposed = true)
+ 0.06892068, -0.36546704, 0.12274583, 0.32616580,
+ -0.2987384, 0.9483147, -0.2328113, -0.4555157,
+ 0.2298177, -0.5828477, 0.1100655, 0.1293499), isTransposed = true)
model1.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps))
model2.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps))
@@ -1664,9 +1674,10 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
// The solution is generated by https://github.com/yanboliang/bound-optimization.
val coefficientsExpected = new DenseMatrix(3, 4, Array(
- 1.62410051, 1.38219391, 1.34486618, 1.74641729,
- 1.23058989, 2.71787825, 1.0, 1.00007073,
- 1.79478632, 1.14360459, 1.33011603, 1.55093897), isTransposed = true)
+ 1.5933935326002155, 1.4427758360562475, 1.356079506266844, 1.7818682794856215,
+ 1.2224266732592248, 2.762691362720858, 1.0005885171478472, 1.0000022613855966,
+ 1.7524631428961193, 1.2292565990448736, 1.3433784431904323, 1.5846063017678864),
+ isTransposed = true)
checkCoefficientsEquivalent(model1.coefficientMatrix, coefficientsExpected)
assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
@@ -1703,27 +1714,27 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
$`0`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- -0.62244703
+ -0.69265374
data.V3 .
data.V4 .
data.V5 .
- data.V6 0.08419825
+ data.V6 0.09064661
$`1`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- -0.2804845
- data.V3 -0.1336960
- data.V4 0.3717091
- data.V5 -0.1530363
- data.V6 -0.2035286
+ -0.2260274
+ data.V3 -0.1144333
+ data.V4 0.3204703
+ data.V5 -0.1621061
+ data.V6 -0.2308192
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- 0.9029315
+ 0.9186811
data.V3 .
- data.V4 -0.4629737
+ data.V4 -0.4832131
data.V5 .
data.V6 .
@@ -1732,25 +1743,25 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
$`0`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- -0.44215290
+ -0.44707756
data.V3 .
data.V4 .
- data.V5 0.01767089
- data.V6 0.02542866
+ data.V5 0.01641412
+ data.V6 0.03570376
$`1`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- 0.76308326
- data.V3 -0.06818576
+ 0.75180900
+ data.V3 -0.05110822
data.V4 .
- data.V5 -0.20446351
- data.V6 -0.13017924
+ data.V5 -0.21595670
+ data.V6 -0.16162836
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- -0.3209304
+ -0.3047314
data.V3 .
data.V4 .
data.V5 .
@@ -1759,15 +1770,15 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
*/
val coefficientsRStd = new DenseMatrix(3, 4, Array(
- 0.0, 0.0, 0.0, 0.08419825,
- -0.1336960, 0.3717091, -0.1530363, -0.2035286,
- 0.0, -0.4629737, 0.0, 0.0), isTransposed = true)
- val interceptsRStd = Vectors.dense(-0.62244703, -0.2804845, 0.9029315)
+ 0.0, 0.0, 0.0, 0.09064661,
+ -0.1144333, 0.3204703, -0.1621061, -0.2308192,
+ 0.0, -0.4832131, 0.0, 0.0), isTransposed = true)
+ val interceptsRStd = Vectors.dense(-0.72638218, -0.01737265, 0.74375484)
val coefficientsR = new DenseMatrix(3, 4, Array(
- 0.0, 0.0, 0.01767089, 0.02542866,
- -0.06818576, 0.0, -0.20446351, -0.13017924,
+ 0.0, 0.0, 0.01641412, 0.03570376,
+ -0.05110822, 0.0, -0.21595670, -0.16162836,
0.0, 0.0, 0.0, 0.0), isTransposed = true)
- val interceptsR = Vectors.dense(-0.44215290, 0.76308326, -0.3209304)
+ val interceptsR = Vectors.dense(-0.44707756, 0.75180900, -0.3047314)
assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.05)
assert(model1.interceptVector ~== interceptsRStd relTol 0.1)
@@ -1800,31 +1811,30 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
coefficientsStd
$`0`
5 x 1 sparse Matrix of class "dgCMatrix"
- s0
+ s0
.
data.V3 .
data.V4 .
data.V5 .
- data.V6 0.01144225
+ data.V6 0.01167
$`1`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
.
- data.V3 -0.1678787
- data.V4 0.5385351
- data.V5 -0.1573039
- data.V6 -0.2471624
+ data.V3 -0.1413518
+ data.V4 0.5100469
+ data.V5 -0.1658025
+ data.V6 -0.2755998
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- data.V3 .
- data.V4 .
- data.V5 .
- data.V6 .
-
+ s0
+ .
+ data.V3 0.001536337
+ data.V4 .
+ data.V5 .
+ data.V6 .
coefficients
$`0`
@@ -1841,9 +1851,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
s0
.
data.V3 .
- data.V4 0.1929409
- data.V5 -0.1889121
- data.V6 -0.1010413
+ data.V4 0.2094410
+ data.V5 -0.1944582
+ data.V6 -0.1307681
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
@@ -1857,13 +1867,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
*/
val coefficientsRStd = new DenseMatrix(3, 4, Array(
- 0.0, 0.0, 0.0, 0.01144225,
- -0.1678787, 0.5385351, -0.1573039, -0.2471624,
- 0.0, 0.0, 0.0, 0.0), isTransposed = true)
+ 0.0, 0.0, 0.0, 0.01167,
+ -0.1413518, 0.5100469, -0.1658025, -0.2755998,
+ 0.001536337, 0.0, 0.0, 0.0), isTransposed = true)
val coefficientsR = new DenseMatrix(3, 4, Array(
0.0, 0.0, 0.0, 0.0,
- 0.0, 0.1929409, -0.1889121, -0.1010413,
+ 0.0, 0.2094410, -0.1944582, -0.1307681,
0.0, 0.0, 0.0, 0.0), isTransposed = true)
assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
@@ -1897,72 +1907,71 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
coefficientsStd
$`0`
5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -1.5898288335
- data.V3 0.1691226336
- data.V4 0.0002983651
- data.V5 0.1001732896
- data.V6 0.2554575585
+ s0
+ -1.68571384
+ data.V3 0.17156077
+ data.V4 0.01658014
+ data.V5 0.10303296
+ data.V6 0.26459585
$`1`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- 0.2125746
- data.V3 -0.2304586
- data.V4 0.6153492
- data.V5 -0.1537017
- data.V6 -0.2975443
+ 0.2364585
+ data.V3 -0.2182805
+ data.V4 0.5960025
+ data.V5 -0.1587441
+ data.V6 -0.3121284
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- 1.37725427
- data.V3 0.06133600
- data.V4 -0.61564761
- data.V5 0.05352840
- data.V6 0.04208671
-
+ 1.44925536
+ data.V3 0.04671972
+ data.V4 -0.61258267
+ data.V5 0.05571116
+ data.V6 0.04753251
coefficients
$`0`
5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -1.5681088
- data.V3 0.1508182
- data.V4 0.0121955
- data.V5 0.1217930
- data.V6 0.2162850
+ s0
+ -1.65140201
+ data.V3 0.15446206
+ data.V4 0.02134769
+ data.V5 0.12524946
+ data.V6 0.22607972
$`1`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- 1.1217130
- data.V3 -0.2028984
- data.V4 0.2862431
- data.V5 -0.1843559
- data.V6 -0.2481218
+ 1.1367722
+ data.V3 -0.1931713
+ data.V4 0.2766548
+ data.V5 -0.1910455
+ data.V6 -0.2629336
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- 0.44639579
- data.V3 0.05208012
- data.V4 -0.29843864
- data.V5 0.06256289
- data.V6 0.03183676
+ 0.51462979
+ data.V3 0.03870921
+ data.V4 -0.29800245
+ data.V5 0.06579606
+ data.V6 0.03685390
*/
val coefficientsRStd = new DenseMatrix(3, 4, Array(
- 0.1691226336, 0.0002983651, 0.1001732896, 0.2554575585,
- -0.2304586, 0.6153492, -0.1537017, -0.2975443,
- 0.06133600, -0.61564761, 0.05352840, 0.04208671), isTransposed = true)
- val interceptsRStd = Vectors.dense(-1.5898288335, 0.2125746, 1.37725427)
+ 0.17156077, 0.01658014, 0.10303296, 0.26459585,
+ -0.2182805, 0.5960025, -0.1587441, -0.3121284,
+ 0.04671972, -0.61258267, 0.05571116, 0.04753251), isTransposed = true)
+ val interceptsRStd = Vectors.dense(-1.68571384, 0.2364585, 1.44925536)
val coefficientsR = new DenseMatrix(3, 4, Array(
- 0.1508182, 0.0121955, 0.1217930, 0.2162850,
- -0.2028984, 0.2862431, -0.1843559, -0.2481218,
- 0.05208012, -0.29843864, 0.06256289, 0.03183676), isTransposed = true)
- val interceptsR = Vectors.dense(-1.5681088, 1.1217130, 0.44639579)
+ 0.15446206, 0.02134769, 0.12524946, 0.22607972,
+ -0.1931713, 0.2766548, -0.1910455, -0.2629336,
+ 0.03870921, -0.29800245, 0.06579606, 0.03685390), isTransposed = true)
+ val interceptsR = Vectors.dense(-1.65140201, 1.1367722, 0.51462979)
assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.001)
assert(model1.interceptVector ~== interceptsRStd relTol 0.05)
@@ -1996,15 +2005,16 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
// The solution is generated by https://github.com/yanboliang/bound-optimization.
val coefficientsExpectedWithStd = new DenseMatrix(3, 4, Array(
- 1.0, 1.0, 1.0, 1.01647497,
- 1.0, 1.44105616, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 1.025970328910313,
+ 1.0, 1.4150672323873024, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0), isTransposed = true)
- val interceptsExpectedWithStd = Vectors.dense(2.52055893, 1.0, 2.560682)
+ val interceptsExpectedWithStd = Vectors.dense(
+ 2.4259954221861473, 1.0000087410832004, 2.490461716522559)
val coefficientsExpected = new DenseMatrix(3, 4, Array(
- 1.0, 1.0, 1.03189386, 1.0,
+ 1.0, 1.0, 1.0336746541813002, 1.0,
1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0), isTransposed = true)
- val interceptsExpected = Vectors.dense(1.06418835, 1.0, 1.20494701)
+ val interceptsExpected = Vectors.dense(1.0521598454128, 1.0, 1.213158241431565)
assert(model1.coefficientMatrix ~== coefficientsExpectedWithStd relTol 0.01)
assert(model1.interceptVector ~== interceptsExpectedWithStd relTol 0.01)
@@ -2037,69 +2047,68 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
5 x 1 sparse Matrix of class "dgCMatrix"
s0
.
- data.V3 0.04048126
- data.V4 -0.23075758
- data.V5 0.08228864
- data.V6 0.22277648
+ data.V3 0.03804571
+ data.V4 -0.23204409
+ data.V5 0.08337512
+ data.V6 0.23029089
$`1`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
.
- data.V3 -0.2149745
- data.V4 0.6478666
- data.V5 -0.1515158
- data.V6 -0.2930498
+ data.V3 -0.2015495
+ data.V4 0.6328705
+ data.V5 -0.1562475
+ data.V6 -0.3071447
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
.
- data.V3 0.17449321
- data.V4 -0.41710901
- data.V5 0.06922716
- data.V6 0.07027332
-
+ data.V3 0.16350376
+ data.V4 -0.40082637
+ data.V5 0.07287239
+ data.V6 0.07685379
coefficients
$`0`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
.
- data.V3 -0.003949652
- data.V4 -0.142982415
- data.V5 0.091439598
- data.V6 0.179286241
+ data.V3 -0.006493452
+ data.V4 -0.143831823
+ data.V5 0.092538445
+ data.V6 0.187244839
$`1`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
.
- data.V3 -0.09071124
- data.V4 0.39752531
- data.V5 -0.16233832
- data.V6 -0.22206059
+ data.V3 -0.08068443
+ data.V4 0.39038929
+ data.V5 -0.16822390
+ data.V6 -0.23667470
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
.
- data.V3 0.09466090
- data.V4 -0.25454290
- data.V5 0.07089872
- data.V6 0.04277435
+ data.V3 0.08717788
+ data.V4 -0.24655746
+ data.V5 0.07568546
+ data.V6 0.04942986
*/
val coefficientsRStd = new DenseMatrix(3, 4, Array(
- 0.04048126, -0.23075758, 0.08228864, 0.22277648,
- -0.2149745, 0.6478666, -0.1515158, -0.2930498,
- 0.17449321, -0.41710901, 0.06922716, 0.07027332), isTransposed = true)
+ 0.03804571, -0.23204409, 0.08337512, 0.23029089,
+ -0.2015495, 0.6328705, -0.1562475, -0.3071447,
+ 0.16350376, -0.40082637, 0.07287239, 0.07685379), isTransposed = true)
val coefficientsR = new DenseMatrix(3, 4, Array(
- -0.003949652, -0.142982415, 0.091439598, 0.179286241,
- -0.09071124, 0.39752531, -0.16233832, -0.22206059,
- 0.09466090, -0.25454290, 0.07089872, 0.04277435), isTransposed = true)
+ -0.006493452, -0.143831823, 0.092538445, 0.187244839,
+ -0.08068443, 0.39038929, -0.16822390, -0.23667470,
+ 0.08717788, -0.24655746, 0.07568546, 0.04942986), isTransposed = true)
assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
@@ -2150,7 +2159,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
.setMaxIter(220).setTol(1e-10)
val trainer2 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight")
.setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false)
- .setMaxIter(90).setTol(1e-10)
+ .setMaxIter(220).setTol(1e-10)
val model1 = trainer1.fit(multinomialDataset)
val model2 = trainer2.fit(multinomialDataset)
@@ -2170,54 +2179,53 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
$`0`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- -0.50133383
+ -0.55325803
data.V3 .
data.V4 .
data.V5 .
- data.V6 0.08351653
+ data.V6 0.09074857
$`1`
5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -0.3151913
- data.V3 -0.1058702
- data.V4 0.3183251
- data.V5 -0.1212969
- data.V6 -0.1629778
+ s0
+ -0.27291366
+ data.V3 -0.09093399
+ data.V4 0.28078251
+ data.V5 -0.12854559
+ data.V6 -0.18382494
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- 0.8165252
+ 0.8261717
data.V3 .
- data.V4 -0.3943069
+ data.V4 -0.4064444
data.V5 .
data.V6 .
-
coefficients
$`0`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- -0.38857157
+ -0.40016908
data.V3 .
data.V4 .
- data.V5 0.02384198
- data.V6 0.03127749
+ data.V5 0.02312769
+ data.V6 0.04159224
$`1`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- 0.62492165
- data.V3 -0.04949061
+ 0.62474768
+ data.V3 -0.03776471
data.V4 .
- data.V5 -0.18584462
- data.V6 -0.08952455
+ data.V5 -0.19588206
+ data.V6 -0.11187712
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
- -0.2363501
+ -0.2245786
data.V3 .
data.V4 .
data.V5 .
@@ -2226,15 +2234,15 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
*/
val coefficientsRStd = new DenseMatrix(3, 4, Array(
- 0.0, 0.0, 0.0, 0.08351653,
- -0.1058702, 0.3183251, -0.1212969, -0.1629778,
- 0.0, -0.3943069, 0.0, 0.0), isTransposed = true)
- val interceptsRStd = Vectors.dense(-0.50133383, -0.3151913, 0.8165252)
+ 0.0, 0.0, 0.0, 0.09074857,
+ -0.09093399, 0.28078251, -0.12854559, -0.18382494,
+ 0.0, -0.4064444, 0.0, 0.0), isTransposed = true)
+ val interceptsRStd = Vectors.dense(-0.55325803, -0.27291366, 0.8261717)
val coefficientsR = new DenseMatrix(3, 4, Array(
- 0.0, 0.0, 0.02384198, 0.03127749,
- -0.04949061, 0.0, -0.18584462, -0.08952455,
+ 0.0, 0.0, 0.02312769, 0.04159224,
+ -0.03776471, 0.0, -0.19588206, -0.11187712,
0.0, 0.0, 0.0, 0.0), isTransposed = true)
- val interceptsR = Vectors.dense(-0.38857157, 0.62492165, -0.2363501)
+ val interceptsR = Vectors.dense(-0.40016908, 0.62474768, -0.2245786)
assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.05)
assert(model1.interceptVector ~== interceptsRStd absTol 0.1)
@@ -2274,27 +2282,26 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
data.V3 .
data.V4 .
data.V5 .
- data.V6 0.03238285
+ data.V6 0.03418889
$`1`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
.
- data.V3 -0.1328284
- data.V4 0.4219321
- data.V5 -0.1247544
- data.V6 -0.1893318
+ data.V3 -0.1114779
+ data.V4 0.3992145
+ data.V5 -0.1315371
+ data.V6 -0.2107956
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
s0
.
- data.V3 0.004572312
+ data.V3 0.006442826
data.V4 .
data.V5 .
data.V6 .
-
coefficients
$`0`
5 x 1 sparse Matrix of class "dgCMatrix"
@@ -2310,9 +2317,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
s0
.
data.V3 .
- data.V4 0.14571623
- data.V5 -0.16456351
- data.V6 -0.05866264
+ data.V4 0.15710979
+ data.V5 -0.16871602
+ data.V6 -0.07928527
$`2`
5 x 1 sparse Matrix of class "dgCMatrix"
@@ -2326,13 +2333,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
*/
val coefficientsRStd = new DenseMatrix(3, 4, Array(
- 0.0, 0.0, 0.0, 0.03238285,
- -0.1328284, 0.4219321, -0.1247544, -0.1893318,
- 0.004572312, 0.0, 0.0, 0.0), isTransposed = true)
+ 0.0, 0.0, 0.0, 0.03418889,
+ -0.1114779, 0.3992145, -0.1315371, -0.2107956,
+ 0.006442826, 0.0, 0.0, 0.0), isTransposed = true)
val coefficientsR = new DenseMatrix(3, 4, Array(
0.0, 0.0, 0.0, 0.0,
- 0.0, 0.14571623, -0.16456351, -0.05866264,
+ 0.0, 0.15710979, -0.16871602, -0.07928527,
0.0, 0.0, 0.0, 0.0), isTransposed = true)
assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
index a5159bc..5d439a2 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -167,7 +167,7 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
val model = new KMeans()
.setK(3)
- .setSeed(1)
+ .setSeed(42)
.setInitMode(MLlibKMeans.RANDOM)
.setTol(1e-6)
.setDistanceMeasure(DistanceMeasure.COSINE)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala
index 97269ee..d3b8575 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala
@@ -34,9 +34,9 @@ class PowerIterationClusteringSuite extends SparkFunSuite
@transient var data: Dataset[_] = _
final val r1 = 1.0
- final val n1 = 10
+ final val n1 = 80
final val r2 = 4.0
- final val n2 = 40
+ final val n2 = 80
override def beforeAll(): Unit = {
super.beforeAll()
@@ -222,7 +222,7 @@ class PowerIterationClusteringSuite extends SparkFunSuite
(0, 1),
(0, 2),
(3, 4)
- )).toDF("src", "dst")
+ )).toDF("src", "dst").repartition(1)
var assignments2 = new PowerIterationClustering()
.setInitMode("random")
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index 70d1177..d28f1f4 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -65,7 +65,7 @@ class Word2VecSuite extends MLTest with DefaultReadWriteTest {
// These expectations are just magic values, characterizing the current
// behavior. The test needs to be updated to be more general, see SPARK-11502
- val magicExp = Vectors.dense(0.30153007534417237, -0.6833061711354689, 0.5116530778733167)
+ val magicExp = Vectors.dense(-0.11654884266582402, 0.3115301721475341, -0.6879349987615239)
testTransformer[(Seq[String], Vector)](docDF, model, "result", "expected") {
case Row(vector1: Vector, vector2: Vector) =>
assert(vector1 ~== magicExp absTol 1E-5, "Transformed vector is different with expected.")
@@ -98,9 +98,9 @@ class Word2VecSuite extends MLTest with DefaultReadWriteTest {
// These expectations are just magic values, characterizing the current
// behavior. The test needs to be updated to be more general, see SPARK-11502
val magicExpected = Seq(
- Vectors.dense(0.3326166272163391, -0.5603077411651611, -0.2309209555387497),
- Vectors.dense(0.32463887333869934, -0.9306551218032837, 1.393115520477295),
- Vectors.dense(-0.27150997519493103, 0.4372006058692932, -0.13465698063373566)
+ Vectors.dense(0.12662248313426971, 0.6108677387237549, -0.006755620241165161),
+ Vectors.dense(-0.3870747685432434, 0.023309476673603058, -1.567158818244934),
+ Vectors.dense(-0.08617416769266129, -0.09897610545158386, 0.6113300323486328)
)
realVectors.zip(magicExpected).foreach {
@@ -122,7 +122,7 @@ class Word2VecSuite extends MLTest with DefaultReadWriteTest {
.setSeed(42L)
.fit(docDF)
- val expected = Map(("b", 0.2608488929093532), ("c", -0.8271274846926078))
+ val expected = Map(("b", -0.024012837558984756), ("c", -0.19355152547359467))
val findSynonymsResult = model.findSynonyms("a", 2).rdd.map {
case Row(w: String, sim: Double) => (w, sim)
}.collectAsMap()
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
index 46fa376..f35c8c6 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
@@ -184,7 +184,7 @@ class GBTRegressorSuite extends MLTest with DefaultReadWriteTest {
val gbt = new GBTRegressor()
.setMaxDepth(3)
.setMaxIter(5)
- .setSeed(123)
+ .setSeed(42)
.setFeatureSubsetStrategy("all")
// In this data, feature 1 is very important.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index 600a432..fc1284e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -232,8 +232,8 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest
print(as.vector(coef(model)))
}
- [1] 2.2960999 0.8087933
- [1] 2.5002642 2.2000403 0.5999485
+ [1] 2.2958751 0.8088523
+ [1] 2.5009266 2.1997901 0.5999522
data <- read.csv("path", header=FALSE)
model1 <- glm(f1, family=gaussian(link=log), data=data, start=c(0,0))
@@ -241,8 +241,8 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest
print(as.vector(coef(model1)))
print(as.vector(coef(model2)))
- [1] 0.23069326 0.07993778
- [1] 0.25001858 0.22002452 0.05998789
+ [1] 0.23063118 0.07995495
+ [1] 0.25016124 0.21995737 0.05999335
data <- read.csv("path", header=FALSE)
for (formula in c(f1, f2)) {
@@ -250,17 +250,17 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest
print(as.vector(coef(model)))
}
- [1] 2.3010179 0.8198976
- [1] 2.4108902 2.2130248 0.6086152
+ [1] 2.3320341 0.8121904
+ [1] 2.2837064 2.2487147 0.6120262
*/
val expected = Seq(
- Vectors.dense(0.0, 2.2960999, 0.8087933),
- Vectors.dense(2.5002642, 2.2000403, 0.5999485),
- Vectors.dense(0.0, 0.23069326, 0.07993778),
- Vectors.dense(0.25001858, 0.22002452, 0.05998789),
- Vectors.dense(0.0, 2.3010179, 0.8198976),
- Vectors.dense(2.4108902, 2.2130248, 0.6086152))
+ Vectors.dense(0.0, 2.2958751, 0.8088523),
+ Vectors.dense(2.5009266, 2.1997901, 0.5999522),
+ Vectors.dense(0.0, 0.23063118, 0.07995495),
+ Vectors.dense(0.25016124, 0.21995737, 0.05999335),
+ Vectors.dense(0.0, 2.3320341, 0.8121904),
+ Vectors.dense(2.2837064, 2.2487147, 0.6120262))
import GeneralizedLinearRegression._
@@ -308,21 +308,21 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest
}
}
- [1] 0.0000000 2.2961005 0.8087932
- [1] 0.0000000 2.2130368 0.8309556
- [1] 0.0000000 1.7176137 0.9610657
- [1] 2.5002642 2.2000403 0.5999485
- [1] 3.1106389 2.0935142 0.5712711
- [1] 6.7597127 1.4581054 0.3994266
+ [1] 0.0000000 2.2958757 0.8088521
+ [1] 0.0000000 2.2128149 0.8310136
+ [1] 0.0000000 1.7174260 0.9611137
+ [1] 2.5009266 2.1997901 0.5999522
+ [1] 3.1113269 2.0932659 0.5712717
+ [1] 6.7604302 1.4578902 0.3994153
*/
val expected = Seq(
- Vectors.dense(0.0, 2.2961005, 0.8087932),
- Vectors.dense(0.0, 2.2130368, 0.8309556),
- Vectors.dense(0.0, 1.7176137, 0.9610657),
- Vectors.dense(2.5002642, 2.2000403, 0.5999485),
- Vectors.dense(3.1106389, 2.0935142, 0.5712711),
- Vectors.dense(6.7597127, 1.4581054, 0.3994266))
+ Vectors.dense(0.0, 2.2958757, 0.8088521),
+ Vectors.dense(0.0, 2.2128149, 0.8310136),
+ Vectors.dense(0.0, 1.7174260, 0.9611137),
+ Vectors.dense(2.5009266, 2.1997901, 0.5999522),
+ Vectors.dense(3.1113269, 2.0932659, 0.5712717),
+ Vectors.dense(6.7604302, 1.4578902, 0.3994153))
var idx = 0
for (fitIntercept <- Seq(false, true);
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
index b33b86b..c25c89b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
@@ -47,9 +47,9 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon
test("power iteration clustering") {
// Generate two circles following the example in the PIC paper.
val r1 = 1.0
- val n1 = 10
+ val n1 = 80
val r2 = 4.0
- val n2 = 10
+ val n2 = 80
val n = n1 + n2
val points = genCircle(r1, n1) ++ genCircle(r2, n2)
val similarities = for (i <- 1 until n; j <- 0 until i) yield {
@@ -81,9 +81,9 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon
test("power iteration clustering on graph") {
// Generate two circles following the example in the PIC paper.
val r1 = 1.0
- val n1 = 10
+ val n1 = 80
val r2 = 4.0
- val n2 = 10
+ val n2 = 80
val n = n1 + n2
val points = genCircle(r1, n1) ++ genCircle(r2, n2)
val similarities = for (i <- 1 until n; j <- 0 until i) yield {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala
index fdaa098..a1ac10c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala
@@ -77,6 +77,7 @@ class StreamingKMeansSuite extends SparkFunSuite with TestSuiteBase {
val k = 2
val d = 5
val r = 0.1
+ val seed = 987654321
// create model with two clusters
val kMeans = new StreamingKMeans()
@@ -88,7 +89,7 @@ class StreamingKMeansSuite extends SparkFunSuite with TestSuiteBase {
Array(5.0, 5.0))
// generate random data for k-means
- val (input, centers) = StreamingKMeansDataGenerator(numPoints, numBatches, k, d, r, 42)
+ val (input, centers) = StreamingKMeansDataGenerator(numPoints, numBatches, k, d, r, seed)
// setup and run the model training
ssc = setupStreams(input, (inputDStream: DStream[Vector]) => {
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 864e2a3..6c9cf7b 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -1193,19 +1193,19 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada
... (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9),
... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1),
... (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
- >>> df = spark.createDataFrame(data).toDF("src", "dst", "weight")
+ >>> df = spark.createDataFrame(data).toDF("src", "dst", "weight").repartition(1)
>>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight")
>>> assignments = pic.assignClusters(df)
>>> assignments.sort(assignments.id).show(truncate=False)
+---+-------+
|id |cluster|
+---+-------+
- |0 |1 |
- |1 |1 |
- |2 |1 |
- |3 |1 |
- |4 |1 |
- |5 |0 |
+ |0 |0 |
+ |1 |0 |
+ |2 |0 |
+ |3 |0 |
+ |4 |0 |
+ |5 |1 |
+---+-------+
...
>>> pic_path = temp_path + "/pic"
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 3f9de9c..595ab18 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -3064,24 +3064,24 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
+----+--------------------+
|word| vector|
+----+--------------------+
- | a|[0.09461779892444...|
- | b|[1.15474212169647...|
- | c|[-0.3794820010662...|
+ | a|[0.09511678665876...|
+ | b|[-1.2028766870498...|
+ | c|[0.30153277516365...|
+----+--------------------+
...
>>> model.findSynonymsArray("a", 2)
- [(u'b', 0.25053444504737854), (u'c', -0.6980510950088501)]
+ [(u'b', 0.015859870240092278), (u'c', -0.5680795907974243)]
>>> from pyspark.sql.functions import format_number as fmt
>>> model.findSynonyms("a", 2).select("word", fmt("similarity", 5).alias("similarity")).show()
+----+----------+
|word|similarity|
+----+----------+
- | b| 0.25053|
- | c| -0.69805|
+ | b| 0.01586|
+ | c| -0.56808|
+----+----------+
...
>>> model.transform(doc).head().model
- DenseVector([0.5524, -0.4995, -0.3599, 0.0241, 0.3461])
+ DenseVector([-0.4833, 0.1855, -0.273, -0.0509, -0.4769])
>>> word2vecPath = temp_path + "/word2vec"
>>> word2Vec.save(word2vecPath)
>>> loadedWord2Vec = Word2Vec.load(word2vecPath)
diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py
index 520d791..bf27164 100644
--- a/python/pyspark/ml/recommendation.py
+++ b/python/pyspark/ml/recommendation.py
@@ -79,27 +79,27 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
>>> test = spark.createDataFrame([(0, 2), (1, 0), (2, 0)], ["user", "item"])
>>> predictions = sorted(model.transform(test).collect(), key=lambda r: r[0])
>>> predictions[0]
- Row(user=0, item=2, prediction=-0.13807615637779236)
+ Row(user=0, item=2, prediction=0.6929101347923279)
>>> predictions[1]
- Row(user=1, item=0, prediction=2.6258413791656494)
+ Row(user=1, item=0, prediction=3.47356915473938)
>>> predictions[2]
- Row(user=2, item=0, prediction=-1.5018409490585327)
+ Row(user=2, item=0, prediction=-0.8991986513137817)
>>> user_recs = model.recommendForAllUsers(3)
>>> user_recs.where(user_recs.user == 0)\
.select("recommendations.item", "recommendations.rating").collect()
- [Row(item=[0, 1, 2], rating=[3.910..., 1.992..., -0.138...])]
+ [Row(item=[0, 1, 2], rating=[3.910..., 1.997..., 0.692...])]
>>> item_recs = model.recommendForAllItems(3)
>>> item_recs.where(item_recs.item == 2)\
.select("recommendations.user", "recommendations.rating").collect()
- [Row(user=[2, 1, 0], rating=[4.901..., 3.981..., -0.138...])]
+ [Row(user=[2, 1, 0], rating=[4.892..., 3.991..., 0.692...])]
>>> user_subset = df.where(df.user == 2)
>>> user_subset_recs = model.recommendForUserSubset(user_subset, 3)
>>> user_subset_recs.select("recommendations.item", "recommendations.rating").first()
- Row(item=[2, 1, 0], rating=[4.901..., 1.056..., -1.501...])
+ Row(item=[2, 1, 0], rating=[4.892..., 1.076..., -0.899...])
>>> item_subset = df.where(df.item == 0)
>>> item_subset_recs = model.recommendForItemSubset(item_subset, 3)
>>> item_subset_recs.select("recommendations.user", "recommendations.rating").first()
- Row(user=[0, 1, 2], rating=[3.910..., 2.625..., -1.501...])
+ Row(user=[0, 1, 2], rating=[3.910..., 3.473..., -0.899...])
>>> als_path = temp_path + "/als"
>>> als.save(als_path)
>>> als2 = ALS.load(als_path)
diff --git a/python/pyspark/ml/tests/test_algorithms.py b/python/pyspark/ml/tests/test_algorithms.py
index 6082082..034eaed 100644
--- a/python/pyspark/ml/tests/test_algorithms.py
+++ b/python/pyspark/ml/tests/test_algorithms.py
@@ -83,7 +83,7 @@ class MultilayerPerceptronClassifierTest(SparkSessionTestCase):
result = model.transform(test).head()
expected_prediction = 2.0
expected_probability = [0.0, 0.0, 1.0]
- expected_rawPrediction = [57.3955, -124.5462, 67.9943]
+ expected_rawPrediction = [-11.6081922998, -8.15827998691, 22.17757045]
self.assertTrue(result.prediction, expected_prediction)
self.assertTrue(np.allclose(result.probability, expected_probability, atol=1E-4))
self.assertTrue(np.allclose(result.rawPrediction, expected_rawPrediction, atol=1E-4))
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index 1f4abf5..be7b8da 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -504,15 +504,15 @@ class TrainValidationSplit(Estimator, ValidatorParams, HasParallelism, HasCollec
... (Vectors.dense([0.5]), 0.0),
... (Vectors.dense([0.6]), 1.0),
... (Vectors.dense([1.0]), 1.0)] * 10,
- ... ["features", "label"])
+ ... ["features", "label"]).repartition(1)
>>> lr = LogisticRegression()
>>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
>>> evaluator = BinaryClassificationEvaluator()
>>> tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
- ... parallelism=2)
+ ... parallelism=1, seed=42)
>>> tvsModel = tvs.fit(dataset)
>>> evaluator.evaluate(tvsModel.transform(dataset))
- 0.8333...
+ 0.833...
.. versionadded:: 2.0.0
"""
diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py
index 3d4eae8..3dd7cb2 100644
--- a/python/pyspark/mllib/recommendation.py
+++ b/python/pyspark/mllib/recommendation.py
@@ -100,16 +100,16 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
>>> users_for_products[0]
(1, (Rating(user=2, product=1, rating=...),))
- >>> model = ALS.train(ratings, 1, nonnegative=True, seed=10)
+ >>> model = ALS.train(ratings, 1, nonnegative=True, seed=123456789)
>>> model.predict(2, 2)
3.73...
>>> df = sqlContext.createDataFrame([Rating(1, 1, 1.0), Rating(1, 2, 2.0), Rating(2, 1, 2.0)])
- >>> model = ALS.train(df, 1, nonnegative=True, seed=10)
+ >>> model = ALS.train(df, 1, nonnegative=True, seed=123456789)
>>> model.predict(2, 2)
3.73...
- >>> model = ALS.trainImplicit(ratings, 1, nonnegative=True, seed=10)
+ >>> model = ALS.trainImplicit(ratings, 1, nonnegative=True, seed=123456789)
>>> model.predict(2, 2)
0.4...
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 8227e82..58d74f5 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -795,9 +795,9 @@ class DataFrame(object):
>>> df = spark.range(10)
>>> df.sample(0.5, 3).count()
- 4
+ 7
>>> df.sample(fraction=0.5, seed=3).count()
- 4
+ 7
>>> df.sample(withReplacement=True, fraction=0.5, seed=3).count()
1
>>> df.sample(1.0).count()
@@ -865,8 +865,8 @@ class DataFrame(object):
+---+-----+
|key|count|
+---+-----+
- | 0| 5|
- | 1| 9|
+ | 0| 3|
+ | 1| 6|
+---+-----+
>>> dataset.sampleBy(col("key"), fractions={2: 1.0}, seed=0).count()
33
@@ -898,10 +898,10 @@ class DataFrame(object):
>>> splits = df4.randomSplit([1.0, 2.0], 24)
>>> splits[0].count()
- 1
+ 2
>>> splits[1].count()
- 3
+ 2
"""
for w in weights:
if w < 0.0:
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index bc28c9d..6ae2357 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -584,8 +584,8 @@ def rand(seed=None):
.. note:: The function is non-deterministic in general case.
>>> df.withColumn('rand', rand(seed=42) * 3).collect()
- [Row(age=2, name=u'Alice', rand=1.1568609015300986),
- Row(age=5, name=u'Bob', rand=1.403379671529166)]
+ [Row(age=2, name=u'Alice', rand=2.4052597283576684),
+ Row(age=5, name=u'Bob', rand=2.3913904055683974)]
"""
sc = SparkContext._active_spark_context
if seed is not None:
@@ -604,8 +604,8 @@ def randn(seed=None):
.. note:: The function is non-deterministic in general case.
>>> df.withColumn('randn', randn(seed=42)).collect()
- [Row(age=2, name=u'Alice', randn=-0.7556247885860078),
- Row(age=5, name=u'Bob', randn=-0.0861619008451133)]
+ [Row(age=2, name=u'Alice', randn=1.1027054481455365),
+ Row(age=5, name=u'Bob', randn=0.7400395449950132)]
"""
sc = SparkContext._active_spark_context
if seed is not None:
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index b777573..273749e 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -83,9 +83,9 @@ class FunctionsTests(ReusedSQLTestCase):
self.assertTrue(abs(corr - 0.95734012) < 1e-6)
def test_sampleby(self):
- df = self.sc.parallelize([Row(a=i, b=(i % 3)) for i in range(10)]).toDF()
+ df = self.sc.parallelize([Row(a=i, b=(i % 3)) for i in range(100)]).toDF()
sampled = df.stat.sampleBy(u"b", fractions={0: 0.5, 1: 0.5}, seed=0)
- self.assertTrue(sampled.count() == 3)
+ self.assertTrue(sampled.count() == 35)
def test_cov(self):
df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
index 752c9d5..469c24b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
@@ -17,25 +17,21 @@
package org.apache.spark.sql.catalyst.expressions
-import org.scalatest.Matchers._
-
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, LongType}
class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {
test("random") {
- checkDoubleEvaluation(Rand(30), 0.31429268272540556 +- 0.001)
- checkDoubleEvaluation(Randn(30), -0.4798519469521663 +- 0.001)
+ checkEvaluation(Rand(30), 0.2762195585886885)
+ checkEvaluation(Randn(30), -1.0451987154313813)
- checkDoubleEvaluation(
- new Rand(Literal.create(null, LongType)), 0.8446490682263027 +- 0.001)
- checkDoubleEvaluation(
- new Randn(Literal.create(null, IntegerType)), 1.1164209726833079 +- 0.001)
+ checkEvaluation(new Rand(Literal.create(null, LongType)), 0.7604953758285915)
+ checkEvaluation(new Randn(Literal.create(null, IntegerType)), 1.6034991609278433)
}
test("SPARK-9127 codegen with long seed") {
- checkDoubleEvaluation(Rand(5419823303878592871L), 0.2304755080444375 +- 0.001)
- checkDoubleEvaluation(Randn(5419823303878592871L), -1.2824262718225607 +- 0.001)
+ checkEvaluation(Rand(5419823303878592871L), 0.7145363364564755)
+ checkEvaluation(Randn(5419823303878592871L), 0.7816815274533012)
}
}
diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out
index cf5add6..09e2c63 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out
@@ -141,12 +141,12 @@ from
-- !query 13 schema
struct<a:int,rand(0):double,sum(b):bigint>
-- !query 13 output
-1 0.4048454303385226 2
-1 0.8446490682263027 1
-2 0.5871875724155838 1
-2 0.8865128837019473 2
-3 0.742083829230211 1
-3 0.9179913208300406 2
+1 0.5234194256885571 2
+1 0.7604953758285915 1
+2 0.0953472826424725 1
+2 0.3163249920547614 2
+3 0.2710259815484829 2
+3 0.7141011170991605 1
-- !query 14
diff --git a/sql/core/src/test/resources/sql-tests/results/random.sql.out b/sql/core/src/test/resources/sql-tests/results/random.sql.out
index bca6732..acd0609 100644
--- a/sql/core/src/test/resources/sql-tests/results/random.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/random.sql.out
@@ -7,7 +7,7 @@ SELECT rand(0)
-- !query 0 schema
struct<rand(0):double>
-- !query 0 output
-0.8446490682263027
+0.7604953758285915
-- !query 1
@@ -15,7 +15,7 @@ SELECT rand(cast(3 / 7 AS int))
-- !query 1 schema
struct<rand(CAST((CAST(3 AS DOUBLE) / CAST(7 AS DOUBLE)) AS INT)):double>
-- !query 1 output
-0.8446490682263027
+0.7604953758285915
-- !query 2
@@ -23,7 +23,7 @@ SELECT rand(NULL)
-- !query 2 schema
struct<rand(CAST(NULL AS INT)):double>
-- !query 2 output
-0.8446490682263027
+0.7604953758285915
-- !query 3
@@ -31,7 +31,7 @@ SELECT rand(cast(NULL AS int))
-- !query 3 schema
struct<rand(CAST(NULL AS INT)):double>
-- !query 3 output
-0.8446490682263027
+0.7604953758285915
-- !query 4
@@ -48,7 +48,7 @@ SELECT randn(0L)
-- !query 5 schema
struct<randn(0):double>
-- !query 5 output
-1.1164209726833079
+1.6034991609278433
-- !query 6
@@ -56,7 +56,7 @@ SELECT randn(cast(3 / 7 AS long))
-- !query 6 schema
struct<randn(CAST((CAST(3 AS DOUBLE) / CAST(7 AS DOUBLE)) AS BIGINT)):double>
-- !query 6 output
-1.1164209726833079
+1.6034991609278433
-- !query 7
@@ -64,7 +64,7 @@ SELECT randn(NULL)
-- !query 7 schema
struct<randn(CAST(NULL AS INT)):double>
-- !query 7 output
-1.1164209726833079
+1.6034991609278433
-- !query 8
@@ -72,7 +72,7 @@ SELECT randn(cast(NULL AS long))
-- !query 8 schema
struct<randn(CAST(NULL AS BIGINT)):double>
-- !query 8 output
-1.1164209726833079
+1.6034991609278433
-- !query 9
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
index 589873b..2a74bfe 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
@@ -47,7 +47,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext {
val data = sparkContext.parallelize(1 to n, 2).toDF("id")
checkAnswer(
data.sample(withReplacement = false, 0.05, seed = 13),
- Seq(3, 17, 27, 58, 62).map(Row(_))
+ Seq(37, 8, 90).map(Row(_))
)
}
@@ -371,7 +371,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext {
val sampled = df.stat.sampleBy("key", Map(0 -> 0.1, 1 -> 0.2), 0L)
checkAnswer(
sampled.groupBy("key").count().orderBy("key"),
- Seq(Row(0, 6), Row(1, 11)))
+ Seq(Row(0, 1), Row(1, 6)))
}
test("sampleBy one column") {
@@ -379,7 +379,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext {
val sampled = df.stat.sampleBy($"key", Map(0 -> 0.1, 1 -> 0.2), 0L)
checkAnswer(
sampled.groupBy("key").count().orderBy("key"),
- Seq(Row(0, 6), Row(1, 11)))
+ Seq(Row(0, 1), Row(1, 6)))
}
test("sampleBy multiple columns") {
@@ -389,7 +389,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext {
struct($"name", $"key"), Map(Row("Foo", 0) -> 0.1, Row("Foo", 1) -> 0.2), 0L)
checkAnswer(
sampled.groupBy("key").count().orderBy("key"),
- Seq(Row(0, 6), Row(1, 11)))
+ Seq(Row(0, 1), Row(1, 6)))
}
// This test case only verifies that `DataFrame.countMinSketch()` methods do return
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 050699d..6e35b52 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -618,7 +618,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
val data = sparkContext.parallelize(1 to n, 2).toDS()
checkDataset(
data.sample(withReplacement = false, 0.05, seed = 13),
- 3, 17, 27, 58, 62)
+ 8, 37, 90)
}
test("sample fraction should not be negative with replacement") {
@@ -650,9 +650,10 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
}
test("SPARK-16686: Dataset.sample with seed results shouldn't depend on downstream usage") {
+ val a = 7
val simpleUdf = udf((n: Int) => {
- require(n != 1, "simpleUdf shouldn't see id=1!")
- 1
+ require(n != a, s"simpleUdf shouldn't see id=$a!")
+ a
})
val df = Seq(
@@ -668,10 +669,10 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
(9, "string9")
).toDF("id", "stringData")
val sampleDF = df.sample(false, 0.7, 50)
- // After sampling, sampleDF doesn't contain id=1.
- assert(!sampleDF.select("id").as[Int].collect.contains(1))
- // simpleUdf should not encounter id=1.
- checkAnswer(sampleDF.select(simpleUdf($"id")), List.fill(sampleDF.count.toInt)(Row(1)))
+ // After sampling, sampleDF doesn't contain id=a.
+ assert(!sampleDF.select("id").as[Int].collect.contains(a))
+ // simpleUdf should not encounter id=a.
+ checkAnswer(sampleDF.select(simpleUdf($"id")), List.fill(sampleDF.count.toInt)(Row(a)))
}
test("SPARK-11436: we should rebind right encoder when join 2 datasets") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/TestCsvData.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/TestCsvData.scala
index 3e20cc4..7999331 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/TestCsvData.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/TestCsvData.scala
@@ -24,8 +24,7 @@ private[csv] trait TestCsvData {
def sampledTestData: Dataset[String] = {
spark.range(0, 100, 1).map { index =>
- val predefinedSample = Set[Long](2, 8, 15, 27, 30, 34, 35, 37, 44, 46,
- 57, 62, 68, 72)
+ val predefinedSample = Set[Long](3, 18, 20, 24, 50, 60, 87, 99)
if (predefinedSample.contains(index)) {
index.toString
} else {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala
index 6e9559e..1750333 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala
@@ -236,8 +236,7 @@ private[json] trait TestJsonData {
def sampledTestData: Dataset[String] = {
spark.range(0, 100, 1).map { index =>
- val predefinedSample = Set[Long](2, 8, 15, 27, 30, 34, 35, 37, 44, 46,
- 57, 62, 68, 72)
+ val predefinedSample = Set[Long](3, 18, 20, 24, 50, 60, 87, 99)
if (predefinedSample.contains(index)) {
s"""{"f1":${index.toString}}"""
} else {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org