You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/02/17 23:00:14 UTC
incubator-systemml git commit: [SYSTEMML-1238] Updated the default
parameters of mllearn to match that of scikit learn.
Repository: incubator-systemml
Updated Branches:
refs/heads/master cf92e8417 -> 9d0087cbb
[SYSTEMML-1238] Updated the default parameters of mllearn to match that of
scikit learn.
- Also updated the test to compare our algorithm to scikit-learn.
Closes #398.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/9d0087cb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/9d0087cb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/9d0087cb
Branch: refs/heads/master
Commit: 9d0087cbbd250c9b486923555b450602f816cf19
Parents: cf92e84
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Fri Feb 17 14:54:23 2017 -0800
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Fri Feb 17 14:59:49 2017 -0800
----------------------------------------------------------------------
docs/algorithms-regression.md | 8 +-
docs/beginners-guide-python.md | 2 +-
docs/python-reference.md | 6 +-
.../spark/utils/RDDConverterUtilsExt.java | 2 +-
src/main/python/systemml/mllearn/estimators.py | 34 ++++----
src/main/python/tests/test_mllearn_df.py | 56 +++++++------
src/main/python/tests/test_mllearn_numpy.py | 87 ++++++++++++++------
7 files changed, 124 insertions(+), 71 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/docs/algorithms-regression.md
----------------------------------------------------------------------
diff --git a/docs/algorithms-regression.md b/docs/algorithms-regression.md
index 992862e..80b38a3 100644
--- a/docs/algorithms-regression.md
+++ b/docs/algorithms-regression.md
@@ -83,8 +83,8 @@ efficient when the number of features $m$ is relatively small
<div data-lang="Python" markdown="1">
{% highlight python %}
from systemml.mllearn import LinearRegression
-# C = 1/reg
-lr = LinearRegression(sqlCtx, fit_intercept=True, C=1.0, solver='direct-solve')
+# C = 1/reg (to disable regularization, use float("inf"))
+lr = LinearRegression(sqlCtx, fit_intercept=True, normalize=False, C=float("inf"), solver='direct-solve')
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
y_test = lr.fit(X_train, y_train)
# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features"
@@ -125,8 +125,8 @@ y_test = lr.fit(df_train)
<div data-lang="Python" markdown="1">
{% highlight python %}
from systemml.mllearn import LinearRegression
-# C = 1/reg
-lr = LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg')
+# C = 1/reg (to disable regularization, use float("inf"))
+lr = LinearRegression(sqlCtx, fit_intercept=True, normalize=False, max_iter=100, tol=0.000001, C=float("inf"), solver='newton-cg')
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrames or SciPy Sparse matrices
y_test = lr.fit(X_train, y_train)
# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features"
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/docs/beginners-guide-python.md
----------------------------------------------------------------------
diff --git a/docs/beginners-guide-python.md b/docs/beginners-guide-python.md
index 4d1b098..ffab09e 100644
--- a/docs/beginners-guide-python.md
+++ b/docs/beginners-guide-python.md
@@ -228,7 +228,7 @@ X_test = diabetes_X[-20:]
y_train = diabetes.target[:-20]
y_test = diabetes.target[-20:]
# Create linear regression object
-regr = LinearRegression(sqlCtx, fit_intercept=True, C=1, solver='direct-solve')
+regr = LinearRegression(sqlCtx, fit_intercept=True, C=float("inf"), solver='direct-solve')
# Train the model using the training sets
regr.fit(X_train, y_train)
y_predicted = regr.predict(X_test)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/docs/python-reference.md
----------------------------------------------------------------------
diff --git a/docs/python-reference.md b/docs/python-reference.md
index 65dcb5c..8d38598 100644
--- a/docs/python-reference.md
+++ b/docs/python-reference.md
@@ -731,7 +731,7 @@ LogisticRegression score: 0.922222
### Reference documentation
- *class*`systemml.mllearn.estimators.LinearRegression`(*sqlCtx*, *fit\_intercept=True*, *max\_iter=100*, *tol=1e-06*, *C=1.0*, *solver='newton-cg'*, *transferUsingDF=False*)(#systemml.mllearn.estimators.LinearRegression "Permalink to this definition")
+ *class*`systemml.mllearn.estimators.LinearRegression`(*sqlCtx*, *fit\_intercept=True*, *normalize=False*, *max\_iter=100*, *tol=1e-06*, *C=float("inf")*, *solver='newton-cg'*, *transferUsingDF=False*)(#systemml.mllearn.estimators.LinearRegression "Permalink to this definition")
: Bases: `systemml.mllearn.estimators.BaseSystemMLRegressor`{.xref .py
.py-class .docutils .literal}
@@ -760,7 +760,7 @@ LogisticRegression score: 0.922222
>>> # The mean square error
>>> print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2))
- *class*`systemml.mllearn.estimators.LogisticRegression`(*sqlCtx*, *penalty='l2'*, *fit\_intercept=True*, *max\_iter=100*, *max\_inner\_iter=0*, *tol=1e-06*, *C=1.0*, *solver='newton-cg'*, *transferUsingDF=False*)(#systemml.mllearn.estimators.LogisticRegression "Permalink to this definition")
+ *class*`systemml.mllearn.estimators.LogisticRegression`(*sqlCtx*, *penalty='l2'*, *fit\_intercept=True*, *normalize=False*, *max\_iter=100*, *max\_inner\_iter=0*, *tol=1e-06*, *C=1.0*, *solver='newton-cg'*, *transferUsingDF=False*)(#systemml.mllearn.estimators.LogisticRegression "Permalink to this definition")
: Bases: `systemml.mllearn.estimators.BaseSystemMLClassifier`{.xref
.py .py-class .docutils .literal}
@@ -817,7 +817,7 @@ LogisticRegression score: 0.922222
>>> prediction = model.transform(test)
>>> prediction.show()
- *class*`systemml.mllearn.estimators.SVM`(*sqlCtx*, *fit\_intercept=True*, *max\_iter=100*, *tol=1e-06*, *C=1.0*, *is\_multi\_class=False*, *transferUsingDF=False*)(#systemml.mllearn.estimators.SVM "Permalink to this definition")
+ *class*`systemml.mllearn.estimators.SVM`(*sqlCtx*, *fit\_intercept=True*, *normalize=False*, *max\_iter=100*, *tol=1e-06*, *C=1.0*, *is\_multi\_class=False*, *transferUsingDF=False*)(#systemml.mllearn.estimators.SVM "Permalink to this definition")
: Bases: `systemml.mllearn.estimators.BaseSystemMLClassifier`{.xref
.py .py-class .docutils .literal}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
index dea5601..cdf090d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
@@ -195,7 +195,7 @@ public class RDDConverterUtilsExt
long limit = mb.getNumRows()*mb.getNumColumns();
int times = Double.SIZE / Byte.SIZE;
- if( limit * times > Integer.MAX_VALUE )
+ if( limit > Integer.MAX_VALUE / times )
throw new DMLRuntimeException("MatrixBlock of size " + limit + " cannot be converted to dense numpy array");
ret = new byte[(int) (limit * times)];
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/src/main/python/systemml/mllearn/estimators.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mllearn/estimators.py b/src/main/python/systemml/mllearn/estimators.py
index c4eaf3d..4188ade 100644
--- a/src/main/python/systemml/mllearn/estimators.py
+++ b/src/main/python/systemml/mllearn/estimators.py
@@ -294,7 +294,7 @@ class LogisticRegression(BaseSystemMLClassifier):
"""
- def __init__(self, sparkSession, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
+ def __init__(self, sparkSession, penalty='l2', fit_intercept=True, normalize=False, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
"""
Performs both binomial and multinomial logistic regression.
@@ -303,10 +303,11 @@ class LogisticRegression(BaseSystemMLClassifier):
sparkSession: PySpark SparkSession
penalty: Only 'l2' supported
fit_intercept: Specifies whether to add intercept or not (default: True)
+ normalize: This parameter is ignored when fit_intercept is set to False. (default: False)
max_iter: Maximum number of outer (Fisher scoring) iterations (default: 100)
max_inner_iter: Maximum number of inner (conjugate gradient) iterations, or 0 if no maximum limit provided (default: 0)
tol: Tolerance used in the convergence criterion (default: 0.000001)
- C: 1/regularization parameter (default: 1.0)
+ C: 1/regularization parameter (default: 1.0 similar to scikit-learn. To disable regularization, please use float("inf"))
solver: Only 'newton-cg' solver supported
"""
self.sparkSession = sparkSession
@@ -316,12 +317,11 @@ class LogisticRegression(BaseSystemMLClassifier):
self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LogisticRegression(self.uid, self.sc._jsc.sc())
self.estimator.setMaxOuterIter(max_iter)
self.estimator.setMaxInnerIter(max_inner_iter)
- if C <= 0:
- raise Exception('C has to be positive')
- reg = 1.0 / C
+ reg = 0.0 if C == float("inf") else 1.0 / C
+ icpt = 2 if fit_intercept == True and normalize == True else int(fit_intercept)
self.estimator.setRegParam(reg)
self.estimator.setTol(tol)
- self.estimator.setIcpt(int(fit_intercept))
+ self.estimator.setIcpt(icpt)
self.transferUsingDF = transferUsingDF
self.setOutputRawPredictionsToFalse = True
if penalty != 'l2':
@@ -361,7 +361,7 @@ class LinearRegression(BaseSystemMLRegressor):
"""
- def __init__(self, sparkSession, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
+ def __init__(self, sparkSession, fit_intercept=True, normalize=False, max_iter=100, tol=0.000001, C=float("inf"), solver='newton-cg', transferUsingDF=False):
"""
Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.
@@ -369,9 +369,10 @@ class LinearRegression(BaseSystemMLRegressor):
----------
sparkSession: PySpark SparkSession
fit_intercept: Specifies whether to add intercept or not (default: True)
+ normalize: If True, the regressors X will be normalized before regression. This parameter is ignored when fit_intercept is set to False. (default: False)
max_iter: Maximum number of conjugate gradient iterations, or 0 if no maximum limit provided (default: 100)
tol: Tolerance used in the convergence criterion (default: 0.000001)
- C: 1/regularization parameter (default: 1.0)
+ C: 1/regularization parameter (default: float("inf") as scikit-learn does not support regularization by default)
solver: Supports either 'newton-cg' or 'direct-solve' (default: 'newton-cg').
Depending on the size and the sparsity of the feature matrix, one or the other solver may be more efficient.
'direct-solve' solver is more efficient when the number of features is relatively small (m < 1000) and
@@ -386,12 +387,11 @@ class LinearRegression(BaseSystemMLRegressor):
else:
raise Exception('Only newton-cg solver supported')
self.estimator.setMaxIter(max_iter)
- if C <= 0:
- raise Exception('C has to be positive')
- reg = 1.0 / C
+ reg = 0.0 if C == float("inf") else 1.0 / C
+ icpt = 2 if fit_intercept == True and normalize == True else int(fit_intercept)
self.estimator.setRegParam(reg)
self.estimator.setTol(tol)
- self.estimator.setIcpt(int(fit_intercept))
+ self.estimator.setIcpt(icpt)
self.transferUsingDF = transferUsingDF
self.setOutputRawPredictionsToFalse = False
@@ -421,7 +421,7 @@ class SVM(BaseSystemMLClassifier):
"""
- def __init__(self, sparkSession, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False):
+ def __init__(self, sparkSession, fit_intercept=True, normalize=False, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False):
"""
Performs both binary-class and multiclass SVM (Support Vector Machines).
@@ -429,9 +429,10 @@ class SVM(BaseSystemMLClassifier):
----------
sparkSession: PySpark SparkSession
fit_intercept: Specifies whether to add intercept or not (default: True)
+ normalize: This parameter is ignored when fit_intercept is set to False. (default: False)
max_iter: Maximum number iterations (default: 100)
tol: Tolerance used in the convergence criterion (default: 0.000001)
- C: 1/regularization parameter (default: 1.0)
+ C: 1/regularization parameter (default: 1.0 similar to scikit-learn. To disable regularization, please use float("inf"))
is_multi_class: Specifies whether to use binary-class SVM or multi-class SVM algorithm (default: False)
"""
self.sparkSession = sparkSession
@@ -442,10 +443,11 @@ class SVM(BaseSystemMLClassifier):
self.estimator.setMaxIter(max_iter)
if C <= 0:
raise Exception('C has to be positive')
- reg = 1.0 / C
+ reg = 0.0 if C == float("inf") else 1.0 / C
+ icpt = 2 if fit_intercept == True and normalize == True else int(fit_intercept)
self.estimator.setRegParam(reg)
self.estimator.setTol(tol)
- self.estimator.setIcpt(int(fit_intercept))
+ self.estimator.setIcpt(icpt)
self.transferUsingDF = transferUsingDF
self.setOutputRawPredictionsToFalse = False
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/src/main/python/tests/test_mllearn_df.py
----------------------------------------------------------------------
diff --git a/src/main/python/tests/test_mllearn_df.py b/src/main/python/tests/test_mllearn_df.py
index da49953..d949f4e 100644
--- a/src/main/python/tests/test_mllearn_df.py
+++ b/src/main/python/tests/test_mllearn_df.py
@@ -40,7 +40,8 @@ from pyspark.sql import SparkSession
from sklearn import datasets, metrics, neighbors
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
-
+from sklearn import linear_model
+from sklearn.metrics import accuracy_score, r2_score
from systemml.mllearn import LinearRegression, LogisticRegression, NaiveBayes, SVM
sc = SparkContext()
@@ -61,20 +62,40 @@ class TestMLLearn(unittest.TestCase):
y_test = y_digits[int(.9 * n_samples):]
# Convert to DataFrame for i/o: current way to transfer data
logistic = LogisticRegression(sparkSession, transferUsingDF=True)
- score = logistic.fit(X_train, y_train).score(X_test, y_test)
- self.failUnless(score > 0.9)
+ logistic.fit(X_train, y_train)
+ mllearn_predicted = logistic.predict(X_test)
+ sklearn_logistic = linear_model.LogisticRegression()
+ sklearn_logistic.fit(X_train, y_train)
+ self.failUnless(accuracy_score(sklearn_logistic.predict(X_test), mllearn_predicted) > 0.95) # We are comparable to a similar algorithm in scikit learn
- def test_linear_regression_sk2(self):
+ def test_linear_regression(self):
diabetes = datasets.load_diabetes()
diabetes_X = diabetes.data[:, np.newaxis, 2]
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]
- regr = LinearRegression(sparkSession, transferUsingDF=True)
+ regr = LinearRegression(sparkSession, solver='direct-solve', transferUsingDF=True)
regr.fit(diabetes_X_train, diabetes_y_train)
- score = regr.score(diabetes_X_test, diabetes_y_test)
- self.failUnless(score > 0.4) # TODO: Improve r2-score (may be I am using it incorrectly)
+ mllearn_predicted = regr.predict(diabetes_X_test)
+ sklearn_regr = linear_model.LinearRegression()
+ sklearn_regr.fit(diabetes_X_train, diabetes_y_train)
+ self.failUnless(r2_score(sklearn_regr.predict(diabetes_X_test), mllearn_predicted) > 0.95) # We are comparable to a similar algorithm in scikit learn
+
+ def test_linear_regression_cg(self):
+ diabetes = datasets.load_diabetes()
+ diabetes_X = diabetes.data[:, np.newaxis, 2]
+ diabetes_X_train = diabetes_X[:-20]
+ diabetes_X_test = diabetes_X[-20:]
+ diabetes_y_train = diabetes.target[:-20]
+ diabetes_y_test = diabetes.target[-20:]
+ regr = LinearRegression(sparkSession, solver='newton-cg', transferUsingDF=True)
+ regr.fit(diabetes_X_train, diabetes_y_train)
+ mllearn_predicted = regr.predict(diabetes_X_test)
+ sklearn_regr = linear_model.LinearRegression()
+ sklearn_regr.fit(diabetes_X_train, diabetes_y_train)
+ self.failUnless(r2_score(sklearn_regr.predict(diabetes_X_test), mllearn_predicted) > 0.95) # We are comparable to a similar algorithm in scikit learn
+
def test_svm_sk2(self):
digits = datasets.load_digits()
@@ -86,22 +107,11 @@ class TestMLLearn(unittest.TestCase):
X_test = X_digits[int(.9 * n_samples):]
y_test = y_digits[int(.9 * n_samples):]
svm = SVM(sparkSession, is_multi_class=True, transferUsingDF=True)
- score = svm.fit(X_train, y_train).score(X_test, y_test)
- self.failUnless(score > 0.9)
-
- #def test_naive_bayes_sk2(self):
- # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
- # newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
- # newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
- # vectorizer = TfidfVectorizer()
- # # Both vectors and vectors_test are SciPy CSR matrix
- # vectors = vectorizer.fit_transform(newsgroups_train.data)
- # vectors_test = vectorizer.transform(newsgroups_test.data)
- # nb = NaiveBayes(sparkSession)
- # nb.fit(vectors, newsgroups_train.target)
- # pred = nb.predict(vectors_test)
- # score = metrics.f1_score(newsgroups_test.target, pred, average='weighted')
- # self.failUnless(score > 0.8)
+ mllearn_predicted = svm.fit(X_train, y_train).predict(X_test)
+ from sklearn import linear_model, svm
+ clf = svm.LinearSVC()
+ sklearn_predicted = clf.fit(X_train, y_train).predict(X_test)
+ self.failUnless(accuracy_score(sklearn_predicted, mllearn_predicted) > 0.95 )
if __name__ == '__main__':
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9d0087cb/src/main/python/tests/test_mllearn_numpy.py
----------------------------------------------------------------------
diff --git a/src/main/python/tests/test_mllearn_numpy.py b/src/main/python/tests/test_mllearn_numpy.py
index 925554f..faa4d32 100644
--- a/src/main/python/tests/test_mllearn_numpy.py
+++ b/src/main/python/tests/test_mllearn_numpy.py
@@ -40,11 +40,26 @@ from pyspark.sql import SparkSession
from sklearn import datasets, metrics, neighbors
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
-
+from sklearn.metrics import accuracy_score, r2_score
from systemml.mllearn import LinearRegression, LogisticRegression, NaiveBayes, SVM
+from sklearn import linear_model
sc = SparkContext()
sparkSession = SparkSession.builder.getOrCreate()
+import os
+
+def writeColVector(X, fileName):
+ fileName = os.path.join(os.getcwd(), fileName)
+ X.tofile(fileName, sep='\n')
+ metaDataFileContent = '{ "data_type": "matrix", "value_type": "double", "rows":' + str(len(X)) + ', "cols": 1, "nnz": -1, "format": "csv", "author": "systemml-tests", "created": "0000-00-00 00:00:00 PST" }'
+ with open(fileName+'.mtd', 'w') as text_file:
+ text_file.write(metaDataFileContent)
+
+def deleteIfExists(fileName):
+ try:
+ os.remove(fileName)
+ except OSError:
+ pass
# Currently not integrated with JUnit test
# ~/spark-1.6.1-scala-2.11/bin/spark-submit --master local[*] --driver-class-path SystemML.jar test.py
@@ -59,8 +74,11 @@ class TestMLLearn(unittest.TestCase):
X_test = X_digits[int(.9 * n_samples):]
y_test = y_digits[int(.9 * n_samples):]
logistic = LogisticRegression(sparkSession)
- score = logistic.fit(X_train, y_train).score(X_test, y_test)
- self.failUnless(score > 0.9)
+ logistic.fit(X_train, y_train)
+ mllearn_predicted = logistic.predict(X_test)
+ sklearn_logistic = linear_model.LogisticRegression()
+ sklearn_logistic.fit(X_train, y_train)
+ self.failUnless(accuracy_score(sklearn_logistic.predict(X_test), mllearn_predicted) > 0.95) # We are comparable to a similar algorithm in scikit learn
def test_logistic_mlpipeline(self):
training = sparkSession.createDataFrame([
@@ -101,11 +119,27 @@ class TestMLLearn(unittest.TestCase):
diabetes_X_test = diabetes_X[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]
- regr = LinearRegression(sparkSession)
+ regr = LinearRegression(sparkSession, solver='direct-solve')
regr.fit(diabetes_X_train, diabetes_y_train)
- score = regr.score(diabetes_X_test, diabetes_y_test)
- self.failUnless(score > 0.4) # TODO: Improve r2-score (may be I am using it incorrectly)
+ mllearn_predicted = regr.predict(diabetes_X_test)
+ sklearn_regr = linear_model.LinearRegression()
+ sklearn_regr.fit(diabetes_X_train, diabetes_y_train)
+ self.failUnless(r2_score(sklearn_regr.predict(diabetes_X_test), mllearn_predicted) > 0.95) # We are comparable to a similar algorithm in scikit learn
+ def test_linear_regression_cg(self):
+ diabetes = datasets.load_diabetes()
+ diabetes_X = diabetes.data[:, np.newaxis, 2]
+ diabetes_X_train = diabetes_X[:-20]
+ diabetes_X_test = diabetes_X[-20:]
+ diabetes_y_train = diabetes.target[:-20]
+ diabetes_y_test = diabetes.target[-20:]
+ regr = LinearRegression(sparkSession, solver='newton-cg')
+ regr.fit(diabetes_X_train, diabetes_y_train)
+ mllearn_predicted = regr.predict(diabetes_X_test)
+ sklearn_regr = linear_model.LinearRegression()
+ sklearn_regr.fit(diabetes_X_train, diabetes_y_train)
+ self.failUnless(r2_score(sklearn_regr.predict(diabetes_X_test), mllearn_predicted) > 0.95) # We are comparable to a similar algorithm in scikit learn
+
def test_svm(self):
digits = datasets.load_digits()
X_digits = digits.data
@@ -116,8 +150,11 @@ class TestMLLearn(unittest.TestCase):
X_test = X_digits[int(.9 * n_samples):]
y_test = y_digits[int(.9 * n_samples):]
svm = SVM(sparkSession, is_multi_class=True)
- score = svm.fit(X_train, y_train).score(X_test, y_test)
- self.failUnless(score > 0.9)
+ mllearn_predicted = svm.fit(X_train, y_train).predict(X_test)
+ from sklearn import linear_model, svm
+ clf = svm.LinearSVC()
+ sklearn_predicted = clf.fit(X_train, y_train).predict(X_test)
+ self.failUnless(accuracy_score(sklearn_predicted, mllearn_predicted) > 0.95 )
def test_naive_bayes(self):
digits = datasets.load_digits()
@@ -129,22 +166,26 @@ class TestMLLearn(unittest.TestCase):
X_test = X_digits[int(.9 * n_samples):]
y_test = y_digits[int(.9 * n_samples):]
nb = NaiveBayes(sparkSession)
- score = nb.fit(X_train, y_train).score(X_test, y_test)
- self.failUnless(score > 0.8)
+ mllearn_predicted = nb.fit(X_train, y_train).predict(X_test)
+ from sklearn.naive_bayes import MultinomialNB
+ clf = MultinomialNB()
+ sklearn_predicted = clf.fit(X_train, y_train).predict(X_test)
+ self.failUnless(accuracy_score(sklearn_predicted, mllearn_predicted) > 0.95 )
- #def test_naive_bayes1(self):
- # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
- # newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
- # newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
- # vectorizer = TfidfVectorizer()
- # # Both vectors and vectors_test are SciPy CSR matrix
- # vectors = vectorizer.fit_transform(newsgroups_train.data)
- # vectors_test = vectorizer.transform(newsgroups_test.data)
- # nb = NaiveBayes(sparkSession)
- # nb.fit(vectors, newsgroups_train.target)
- # pred = nb.predict(vectors_test)
- # score = metrics.f1_score(newsgroups_test.target, pred, average='weighted')
- # self.failUnless(score > 0.8)
+ def test_naive_bayes1(self):
+ categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
+ newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
+ newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
+ vectorizer = TfidfVectorizer()
+ # Both vectors and vectors_test are SciPy CSR matrix
+ vectors = vectorizer.fit_transform(newsgroups_train.data)
+ vectors_test = vectorizer.transform(newsgroups_test.data)
+ nb = NaiveBayes(sparkSession)
+ mllearn_predicted = nb.fit(vectors, newsgroups_train.target).predict(vectors_test)
+ from sklearn.naive_bayes import MultinomialNB
+ clf = MultinomialNB()
+ sklearn_predicted = clf.fit(vectors, newsgroups_train.target).predict(vectors_test)
+ self.failUnless(accuracy_score(sklearn_predicted, mllearn_predicted) > 0.95 )
if __name__ == '__main__':