You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ra...@apache.org on 2017/02/01 03:23:36 UTC
mahout git commit: MAHOUT-1856 Add Framework for Models, Fitters,
and Tests closes apache/mahout#246
Repository: mahout
Updated Branches:
refs/heads/master f8596b866 -> 9a31923ea
MAHOUT-1856 Add Framework for Models, Fitters, and Tests closes apache/mahout#246
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/9a31923e
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/9a31923e
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/9a31923e
Branch: refs/heads/master
Commit: 9a31923eae3727d9d91bd2c2ed8df12a616a577e
Parents: f8596b8
Author: rawkintrevo <tr...@gmail.com>
Authored: Tue Jan 31 21:23:10 2017 -0600
Committer: rawkintrevo <tr...@gmail.com>
Committed: Tue Jan 31 21:23:10 2017 -0600
----------------------------------------------------------------------
.gitignore | 1 +
.../standard/PreprocessorSuite.scala | 26 ++++
.../standard/RegressionSuite.scala | 27 ++++
.../standard/RegressionTestsSuite.scala | 26 ++++
.../math/algorithms/PreprocessorSuite.scala | 24 ++++
.../math/algorithms/RegressionSuite.scala | 25 ++++
.../math/algorithms/RegressionTestsSuite.scala | 24 ++++
.../apache/mahout/math/algorithms/Fitter.scala | 27 ++++
.../apache/mahout/math/algorithms/Model.scala | 26 ++++
.../math/algorithms/SupervisedFitter.scala | 29 +++++
.../math/algorithms/SupervisedModel.scala | 26 ++++
.../math/algorithms/UnsupervisedFitter.scala | 28 ++++
.../math/algorithms/UnsupervisedModel.scala | 24 ++++
.../algorithms/preprocessing/AsFactor.scala | 127 +++++++++++++++++++
.../algorithms/preprocessing/MeanCenter.scala | 91 +++++++++++++
.../preprocessing/PreprocessorModel.scala | 58 +++++++++
.../preprocessing/StandardScaler.scala | 97 ++++++++++++++
.../regression/CochraneOrcuttModel.scala | 100 +++++++++++++++
.../regression/LinearRegressorModel.scala | 124 ++++++++++++++++++
.../regression/OrdinaryLeastSquaresModel.scala | 66 ++++++++++
.../algorithms/regression/RegressorModel.scala | 58 +++++++++
.../regression/tests/AutocorrelationTests.scala | 57 +++++++++
.../regression/tests/FittnessTests.scala | 56 ++++++++
.../math/algorithms/PreprocessorSuiteBase.scala | 59 +++++++++
.../math/algorithms/RegressionSuiteBase.scala | 81 ++++++++++++
.../algorithms/RegressionTestsSuiteBase.scala | 87 +++++++++++++
.../math/algorithms/PreprocessorSuite.scala | 24 ++++
.../math/algorithms/RegressionSuite.scala | 25 ++++
.../math/algorithms/RegressionTestsSuite.scala | 25 ++++
29 files changed, 1448 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 63490bf..3aee83a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,4 @@ mr/temp
temp
foo
math-tests/
+metastore_db/*
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/PreprocessorSuite.scala
----------------------------------------------------------------------
diff --git a/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/PreprocessorSuite.scala b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/PreprocessorSuite.scala
new file mode 100644
index 0000000..5e2b4ee
--- /dev/null
+++ b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/PreprocessorSuite.scala
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.flinkbindings.standard
+
+
+import org.apache.mahout.flinkbindings.DistributedFlinkSuite
+import org.apache.mahout.math.algorithms.PreprocessorSuiteBase
+import org.scalatest.FunSuite
+
+/**
+ * Preprocessor test suite for the Flink bindings: a FunSuite that mixes in
+ * DistributedFlinkSuite and PreprocessorSuiteBase. It declares no tests of
+ * its own; any test cases come from the mixed-in traits.
+ */
+class PreprocessorSuite extends FunSuite
+ with DistributedFlinkSuite with PreprocessorSuiteBase
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionSuite.scala
----------------------------------------------------------------------
diff --git a/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionSuite.scala b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionSuite.scala
new file mode 100644
index 0000000..5cb6183
--- /dev/null
+++ b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionSuite.scala
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.flinkbindings.standard
+
+
+import org.apache.mahout.flinkbindings.DistributedFlinkSuite
+import org.apache.mahout.math.algorithms.RegressionSuiteBase
+import org.scalatest.FunSuite
+
+/**
+ * Regression test suite for the Flink bindings: a FunSuite that mixes in
+ * DistributedFlinkSuite and RegressionSuiteBase. It declares no tests of
+ * its own; any test cases come from the mixed-in traits.
+ */
+class RegressionSuite extends FunSuite
+ with DistributedFlinkSuite with RegressionSuiteBase
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionTestsSuite.scala
----------------------------------------------------------------------
diff --git a/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionTestsSuite.scala b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionTestsSuite.scala
new file mode 100644
index 0000000..8ddab41
--- /dev/null
+++ b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionTestsSuite.scala
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.flinkbindings.standard
+
+import org.apache.mahout.flinkbindings.DistributedFlinkSuite
+import org.apache.mahout.math.algorithms.RegressionTestsSuiteBase
+import org.scalatest.FunSuite
+
+/**
+ * Suite for the regression diagnostics on the Flink bindings: a FunSuite that
+ * mixes in DistributedFlinkSuite and RegressionTestsSuiteBase. It declares no
+ * tests of its own; any test cases come from the mixed-in traits.
+ */
+class RegressionTestsSuite extends FunSuite
+ with DistributedFlinkSuite with RegressionTestsSuiteBase
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/h2o/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala
----------------------------------------------------------------------
diff --git a/h2o/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala
new file mode 100644
index 0000000..e777f8b
--- /dev/null
+++ b/h2o/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.h2obindings.test.DistributedH2OSuite
+import org.scalatest.FunSuite
+
+/**
+ * Preprocessor test suite for the H2O bindings: a FunSuite that mixes in
+ * DistributedH2OSuite and PreprocessorSuiteBase. It declares no tests of
+ * its own; any test cases come from the mixed-in traits.
+ */
+class PreprocessorSuite extends FunSuite
+ with DistributedH2OSuite with PreprocessorSuiteBase
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala
----------------------------------------------------------------------
diff --git a/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala
new file mode 100644
index 0000000..503eb06
--- /dev/null
+++ b/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.h2obindings.test.DistributedH2OSuite
+import org.scalatest.FunSuite
+
+/**
+ * Regression test suite for the H2O bindings: a FunSuite that mixes in
+ * DistributedH2OSuite and RegressionSuiteBase. It declares no tests of
+ * its own; any test cases come from the mixed-in traits.
+ */
+class RegressionSuite extends FunSuite
+ with DistributedH2OSuite with RegressionSuiteBase
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala
----------------------------------------------------------------------
diff --git a/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala
new file mode 100644
index 0000000..864b045
--- /dev/null
+++ b/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.h2obindings.test.DistributedH2OSuite
+import org.scalatest.FunSuite
+
+/**
+ * Suite for the regression diagnostics on the H2O bindings: a FunSuite that
+ * mixes in DistributedH2OSuite and RegressionTestsSuiteBase. It declares no
+ * tests of its own; any test cases come from the mixed-in traits.
+ */
+class RegressionTestsSuite extends FunSuite
+ with DistributedH2OSuite with RegressionTestsSuiteBase
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Fitter.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Fitter.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Fitter.scala
new file mode 100644
index 0000000..244cefc
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Fitter.scala
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+/**
+ * Root trait of the fitter hierarchy. It declares no members.
+ */
+trait Fitter {
+
+ // Every fitter has a `fit` method, but the signatures differ between fitter
+ // kinds, so none is declared here. This trait is kept as a placeholder in
+ // case we decide there is something all fitters must have in common.
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Model.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Model.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Model.scala
new file mode 100644
index 0000000..0fbe8ac
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Model.scala
@@ -0,0 +1,26 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+/**
+ * Root trait of the model hierarchy. Models are Serializable and carry a
+ * mutable, free-text `summary`.
+ */
+trait Model extends Serializable {
+
+ // Starts out empty; nothing in this trait populates it.
+ var summary: String = ""
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedFitter.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedFitter.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedFitter.scala
new file mode 100644
index 0000000..bf85dee
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedFitter.scala
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.math.drm.DrmLike
+
+/**
+ * A fitter that builds a model from two DRMs sharing the same key type.
+ *
+ * @tparam K row key type of both input DRMs
+ * @tparam M concrete SupervisedModel type returned by `fit`
+ */
+trait SupervisedFitter[K, M <: SupervisedModel[K]] extends Fitter {
+
+ /**
+ * Fits a model.
+ *
+ * @param drmX first input DRM (presumably the features / design matrix -- confirm with implementations)
+ * @param drmTarget second input DRM (presumably the values to predict -- confirm with implementations)
+ * @param hyperparameters optional (name, value) pairs; their meaning is up to the implementation
+ * @return the fitted model
+ */
+ def fit(drmX : DrmLike[K],
+ drmTarget: DrmLike[K],
+ hyperparameters: (Symbol, Any)*): M
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedModel.scala
new file mode 100644
index 0000000..57c20e7
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedModel.scala
@@ -0,0 +1,26 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import scala.collection.mutable
+
+/**
+ * A Model with a mutable map of named results attached to it.
+ *
+ * @tparam K type parameter; not referenced by any member declared in this trait
+ */
+trait SupervisedModel[K] extends Model {
+ // Starts out empty; keyed by Symbol, values are untyped (Any).
+ var testResults: mutable.Map[Symbol, Any] = mutable.Map[Symbol, Any]()
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedFitter.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedFitter.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedFitter.scala
new file mode 100644
index 0000000..5c191d1
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedFitter.scala
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.math.drm.DrmLike
+
+/**
+ * A fitter that builds a model from a single DRM.
+ */
+trait UnsupervisedFitter extends Fitter {
+
+ /**
+ * Fits a model to `input`.
+ *
+ * @param input the DRM to fit on
+ * @param hyperparameters optional (name, value) pairs; their meaning is up to the implementation
+ * @tparam K row key type of `input`
+ * @return the fitted model
+ */
+ def fit[K](input: DrmLike[K],
+ hyperparameters: (Symbol, Any)*): UnsupervisedModel
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedModel.scala
new file mode 100644
index 0000000..f8ff341
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedModel.scala
@@ -0,0 +1,24 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+/**
+ * Marker trait for models returned by an UnsupervisedFitter. It adds no
+ * members to Model.
+ */
+trait UnsupervisedModel extends Model {
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/AsFactor.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/AsFactor.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/AsFactor.scala
new file mode 100644
index 0000000..9d8e10f
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/AsFactor.scala
@@ -0,0 +1,127 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.preprocessing
+
+
+
+import collection._
+import JavaConversions._
+import org.apache.mahout.math._
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.{Vector => MahoutVector}
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.scalabindings._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import MahoutCollections._
+
+/**
+ * Fitter for an indicator ("factor") encoding of the columns of a DRM.
+ *
+ * `fit` finds the largest value in every column and turns that into the
+ * column layout used by AsFactorModel.
+ */
+class AsFactor extends PreprocessorFitter {
+
+  /**
+   * Works out, for each input column, how many indicator columns it needs and
+   * at which output column its slice starts.
+   *
+   * Assumes every cell holds a non-negative integer level in 0..max; the
+   * input is not validated here.
+   *
+   * @param input DRM whose columns are to be encoded
+   * @param hyperparameters accepted for interface compatibility; not read
+   * @return a model holding the total encoded width and the per-column start offsets
+   */
+  def fit[K](input: DrmLike[K],
+             hyperparameters: (Symbol, Any)*): AsFactorModel = {
+
+    import org.apache.mahout.math.function.VectorFunction
+
+    // Column-wise maximum over the whole DRM, as row 0 of a one-row matrix.
+    val colMax = input.allreduceBlock(
+      // map: column maxima of a single block
+      { case (keys, block: Matrix) =>
+        // someday we'll replace this with block.max: Vector
+        // or better yet- block.distinct
+        dense(block.aggregateColumns(new VectorFunction {
+          def apply(f: Vector): Double = f.max
+        }))
+      },
+      // reduce: keep the larger value per column. Without an explicit reducer
+      // the partial results are added together, which turns per-block maxima
+      // into a sum that depends on how the rows happen to be partitioned.
+      (acc: Matrix, part: Matrix) => {
+        for (c <- 0 until acc.ncol) acc(0, c) = scala.math.max(acc(0, c), part(0, c))
+        acc
+      })(0, ::)
+
+    // A column whose largest level is `max` has max + 1 levels (0..max) and so
+    // needs max + 1 indicator columns. Using `max` alone makes the top level of
+    // one column share an output index with level 0 of the next column.
+    val widths: Array[Double] = colMax.toArray.map(_ + 1.0)
+
+    /*
+     Example: for
+       dense((3, 2, 1),
+             (0, 0, 0),
+             (1, 1, 1))
+     the column maxima are (3, 2, 1), the widths (4, 3, 2), the total encoded
+     width is 9 and the start offsets are (0, 4, 7).
+     */
+    new AsFactorModel(widths.sum.toInt,
+      dvec(widths.scanLeft(0.0)(_ + _).take(widths.length)))
+  }
+
+}
+
+/**
+ * Model that expands each input column into a group of indicator columns.
+ *
+ * @param cardinality number of columns of the encoded output
+ * @param factorVec for input column m, entry m is the offset added to the cell
+ * value to obtain the output column that is set to 1.0
+ */
+class AsFactorModel(cardinality: Int, factorVec: MahoutVector) extends PreprocessorModel {
+
+ // Public view of the offsets passed in as `factorVec`.
+ val factorMap: MahoutVector = factorVec
+
+ /**
+ * Encodes `input`: for every cell (n, m) the output cell
+ * (n, factorMap(m) + value) is set to 1.0; all other output cells stay empty.
+ *
+ * @param input DRM to encode; cell values are truncated to Int
+ * @return a DRM with the same keys and `cardinality` columns
+ */
+ def transform[K](input: DrmLike[K]): DrmLike[K] ={
+
+ implicit val ctx = input.context
+
+ // The output width is shipped to the block closure as a one-element vector.
+ val bcastK = drmBroadcast(dvec(cardinality))
+ val bcastFactorMap = drmBroadcast(factorMap)
+
+ implicit val ktag = input.keyClassTag
+
+ val res = input.mapBlock(cardinality) {
+ case (keys, block: Matrix) => {
+ // Shadows the constructor parameter with the broadcast copy.
+ val cardinality: Int = bcastK.value.get(0).toInt
+ val output = new SparseMatrix(block.nrow, cardinality)
+ // fm(m) is the offset applied to input column m
+ val fm = bcastFactorMap.value
+ for (n <- 0 until output.nrow){
+ // m counts the elements visited so far and is used as the input column index
+ var m = 0
+ for (e <- block(n, ::).all() ){
+ output(n, fm.get(m).toInt + e.get().toInt ) = 1.0
+ m += 1
+ }
+ }
+ (keys, output)
+ }
+ }
+ res
+ }
+
+ /**
+ * Intended to map encoded data back to the original columns.
+ *
+ * NOTE(review): as written this does not look like the inverse of `transform`
+ * -- see the notes inside the block. Confirm the intended behaviour before
+ * relying on it.
+ *
+ * @param input encoded DRM
+ * @return a DRM with the same keys
+ */
+ override def invTransform[K](input: DrmLike[K]): DrmLike[K] = {
+ implicit val ctx = input.context
+
+ val bcastK = drmBroadcast(dvec(cardinality))
+ val bcastFactorMap = drmBroadcast(factorMap)
+
+ implicit val ktag = input.keyClassTag
+
+ // NOTE(review): the declared output width is `cardinality`, the same as in
+ // `transform`; the number of original columns (factorMap.length) was
+ // presumably intended -- confirm.
+ val res = input.mapBlock(cardinality) {
+ case (keys, block: Matrix) => {
+ // NOTE(review): `k` is never read.
+ val k: Int = bcastK.value.get(0).toInt
+ // NOTE(review): bcastK holds dvec(cardinality), so `bcastK.value.length`
+ // appears to be 1, while each row assigned below has factorMap.length
+ // entries -- confirm the intended width.
+ val output = new DenseMatrix(block.nrow, bcastK.value.length)
+ // NOTE(review): `fm` is never read.
+ val fm = bcastFactorMap.all.toSeq.map(e => e.get -> e.index).toMap
+
+ import MahoutCollections._
+ // NOTE(review): `indexArray` is never read.
+ val indexArray = Array(1.0) ++ bcastFactorMap.value.toArray.map(i => i.toInt)
+ for (n <- 0 until output.nrow){
+ val v = new DenseVector(bcastFactorMap.value.length)
+ // m counts the non-zero elements seen so far in this row
+ var m = 0
+ // NOTE(review): unchecked cast; throws ClassCastException if the row is
+ // not a RandomAccessSparseVector.
+ for (e <- block(n, ::).asInstanceOf[RandomAccessSparseVector].iterateNonZero() ){
+ // NOTE(review): subtracts the running count m, not the offset
+ // factorMap(m) that `transform` added -- confirm.
+ v.setQuick(m, e.index - m)
+ m += 1
+ }
+ output(n, ::) = v
+ }
+ (keys, output)
+ }
+ }
+ res
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/MeanCenter.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/MeanCenter.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/MeanCenter.scala
new file mode 100644
index 0000000..258ad1b
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/MeanCenter.scala
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.preprocessing
+
+import collection._
+import JavaConversions._
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.Matrix
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.math.{Vector => MahoutVector}
+
+
+
+/**
+ * Fitter that captures the column means of a DRM so they can be subtracted later.
+ */
+class MeanCenter extends PreprocessorFitter {
+
+  /**
+   * Builds a MeanCenterModel from the column means of `input`.
+   *
+   * @param input the DRM whose column means are taken
+   * @param hyperparameters not read by this fitter
+   * @return a model initialised with `input.colMeans()`
+   */
+  def fit[K](input: DrmLike[K],
+             hyperparameters: (Symbol, Any)*): MeanCenterModel = {
+    val columnMeans = input.colMeans()
+    new MeanCenterModel(columnMeans) // could add centers here
+  }
+
+}
+
+/**
+ * A model for mean centering each column of a data set at 0, or at the values
+ * given to `setCenters`.
+ *
+ * @param means per-column means the model was built with
+ */
+class MeanCenterModel(means: MahoutVector) extends PreprocessorModel {
+
+  // Vector subtracted from every row by `transform` and added back by
+  // `invTransform`. Starts as `means`, i.e. centering at 0.
+  var colCentersV: MahoutVector = means
+
+  /**
+   * Re-targets the model so that transformed columns are centered at
+   * `centers` instead of at 0.
+   *
+   * @param centers desired center per column; must have as many entries as `means`
+   * @throws IllegalArgumentException if the two vectors differ in length
+   */
+  def setCenters(centers: MahoutVector): Unit = {
+    if (means.length != centers.length) {
+      throw new IllegalArgumentException(
+        s"Length of centers vector (${centers.length}) must equal length of means vector " +
+          s"(${means.length}) (i.e. the number of columns in the originally fit input).")
+    }
+    // `transform` computes row - colCentersV, so a column with mean m ends up
+    // at m - colCentersV. For that to equal c we need colCentersV = m - c;
+    // storing means + centers would center the data at -centers instead.
+    colCentersV = means - centers
+  }
+
+  /**
+   * Subtracts `colCentersV` from every row of `input`.
+   *
+   * @param input DRM to center; its blocks are cloned, not modified in place
+   * @return a DRM with the same keys and the same number of columns
+   */
+  def transform[K](input: DrmLike[K]): DrmLike[K] = {
+
+    implicit val ctx = input.context
+    implicit val ktag = input.keyClassTag
+
+    val bcastV = drmBroadcast(colCentersV)
+
+    val output = input.mapBlock(input.ncol) {
+      case (keys, block: Matrix) =>
+        val copy: Matrix = block.cloned
+        copy.foreach(row => row -= bcastV.value)
+        (keys, copy)
+    }
+    output
+  }
+
+  /**
+   * Adds `colCentersV` back to every row of `input`, undoing `transform`.
+   *
+   * @param input DRM to shift back; its blocks are cloned, not modified in place
+   * @return a DRM with the same keys and the same number of columns
+   */
+  def invTransform[K](input: DrmLike[K]): DrmLike[K] = {
+
+    implicit val ctx = input.context
+    implicit val ktag = input.keyClassTag
+    val bcastV = drmBroadcast(colCentersV)
+
+    val output = input.mapBlock(input.ncol) {
+      case (keys, block: Matrix) =>
+        val copy: Matrix = block.cloned
+        copy.foreach(row => row += bcastV.value)
+        (keys, copy)
+    }
+    output
+  }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/PreprocessorModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/PreprocessorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/PreprocessorModel.scala
new file mode 100644
index 0000000..5adb87d
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/PreprocessorModel.scala
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.preprocessing
+
+import org.apache.mahout.math.algorithms.{UnsupervisedFitter, UnsupervisedModel}
+import org.apache.mahout.math.drm.DrmLike
+
+/**
+ * An UnsupervisedModel that maps a DRM to another DRM with the same key type,
+ * in both directions.
+ */
+trait PreprocessorModel extends UnsupervisedModel {
+
+ /**
+ * A convenience method for mapping transformed data back to the original.
+ *
+ * @param input DRM to map back
+ * @tparam K row key type, unchanged by the call
+ * @return the resulting DRM
+ */
+ def invTransform[K](input: DrmLike[K]): DrmLike[K]
+
+ /**
+ * Applies the model's transformation to the given DRM.
+ *
+ * @param input DRM to transform
+ * @tparam K row key type, unchanged by the call
+ * @return the transformed DRM
+ */
+ def transform[K](input: DrmLike[K]): DrmLike[K]
+
+}
+
+/**
+ * An UnsupervisedFitter whose `fit` yields a PreprocessorModel, plus a
+ * one-call `fitTransform` shortcut.
+ */
+trait PreprocessorFitter extends UnsupervisedFitter {
+
+  // Model produced by the most recent `fitTransform` call; unset until then.
+  var model: PreprocessorModel = _
+
+  /**
+   * Fits a preprocessing model to `input`.
+   *
+   * @param input the DRM to fit on
+   * @param hyperparameters optional (name, value) pairs; their meaning is up to the implementation
+   * @return the fitted model
+   */
+  def fit[K](input: DrmLike[K],
+             hyperparameters: (Symbol, Any)*): PreprocessorModel
+
+  /**
+   * Fits a model to `input`, stores it in `model`, and returns `input`
+   * transformed by that model.
+   *
+   * @param input the DRM to fit on and then transform
+   * @param hyperparameters forwarded unchanged to `fit`
+   * @return the transformed DRM
+   */
+  def fitTransform[K](input: DrmLike[K],
+                      hyperparameters: (Symbol, Any)*): DrmLike[K] = {
+    val fitted = fit(input, hyperparameters: _*)
+    model = fitted
+    fitted.transform(input)
+  }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/StandardScaler.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/StandardScaler.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/StandardScaler.scala
new file mode 100644
index 0000000..98d0be1
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/StandardScaler.scala
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.preprocessing
+
+import collection._
+import JavaConversions._
+
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.math.{Vector => MahoutVector, Matrix}
+
+/**
+ * Fitter that learns per-column means and standard deviations so that columns can be
+ * scaled to mean 0 and unit variance.
+ */
+class StandardScaler extends PreprocessorFitter {
+
+  /**
+   * Computes the column means and variances of `input` and wraps them (variance turned
+   * into standard deviation) in a [[StandardScalerModel]].
+   * @param input the DRM to fit on
+   * @param hyperparameters accepted for interface compatibility; not read here
+   * @return the fitted scaler model
+   */
+  def fit[K](input: DrmLike[K],
+             hyperparameters: (Symbol, Any)*): StandardScalerModel = {
+    val (colMeans, colVariances) = dcolMeanVars(input)
+    new StandardScalerModel(colMeans, colVariances.sqrt)
+  }
+
+}
+
+/**
+ * Fitted scaler holding one mean and one standard deviation per column.
+ *
+ * @param meanVec per-column means subtracted by `transform`
+ * @param stdev per-column standard deviations divided out by `transform`
+ */
+class StandardScalerModel(meanVec: MahoutVector,
+ stdev: MahoutVector
+ ) extends PreprocessorModel {
+
+
+ /**
+ * Standardizes every row: (row - mean) / stdev, applied block by block on a clone
+ * of each block so the input blocks are not modified.
+ *
+ * NOTE(review): a zero entry in `stdev` (constant column) would divide by zero here;
+ * confirm how callers are expected to handle that.
+ *
+ * @param input the DRM to standardize
+ * @tparam K row-key type of the DRM
+ * @return the standardized DRM
+ */
+ def transform[K](input: DrmLike[K]): DrmLike[K] = {
+ implicit val ctx = input.context
+
+
+ // The key ClassTag is brought into implicit scope further down, before mapBlock() is called.
+
+ val bcastMu = drmBroadcast(meanVec)
+ val bcastSigma = drmBroadcast(stdev)
+
+ implicit val ktag = input.keyClassTag
+
+ val res = input.mapBlock(input.ncol) {
+ case (keys, block: Matrix) => {
+ val copy: Matrix = block.cloned
+ copy.foreach(row => row := (row - bcastMu) / bcastSigma )
+ (keys, copy)
+ }
+ }
+ res
+ }
+
+ /**
+ * Given an output, transform it back into the original,
+ * e.g. a normalized column back to its original values: row * stdev + mean.
+ *
+ * @param input the standardized DRM to map back
+ * @tparam K row-key type of the DRM
+ * @return the DRM on the original scale
+ */
+ def invTransform[K](input: DrmLike[K]): DrmLike[K] = { // [K: ClassTag]
+
+ implicit val ctx = input.context
+
+ // Some mapBlock() calls need the key ClassTag in implicit scope
+ implicit val ktag = input.keyClassTag
+
+ val bcastMu = drmBroadcast(meanVec)
+ val bcastSigma = drmBroadcast(stdev)
+
+ val res = input.mapBlock(input.ncol) {
+ case (keys, block: Matrix) => {
+ val copy: Matrix = block.cloned
+ copy.foreach(row => row := (row * bcastSigma ) + bcastMu)
+ (keys, copy)
+ }
+ }
+ res
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
new file mode 100644
index 0000000..844e72f
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.regression
+
+import org.apache.mahout.math.{Vector => MahoutVector}
+import org.apache.mahout.math.drm.{CacheHint, DrmLike, safeToNonNegInt}
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+
+/**
+ * Model produced by the Cochrane-Orcutt fitter; wraps another linear regression model
+ * and delegates prediction to it.
+ *
+ * @param regressor the wrapped model used for prediction
+ */
+class CochraneOrcuttModel[K](regressor: LinearRegressorModel[K]) extends LinearRegressorModel[K] {
+ // https://en.wikipedia.org/wiki/Cochrane%E2%80%93Orcutt_estimation
+
+ // Coefficient vectors recorded by the fitter, one entry per iteration.
+ var betas: Array[MahoutVector] = _
+
+ /** Predicts by delegating to the wrapped regressor model. */
+ def predict(drmPredictors: DrmLike[K]): DrmLike[K] = {
+ regressor.predict(drmPredictors)
+ }
+
+}
+
+/**
+ * Fitter for Cochrane-Orcutt estimation on top of another linear regressor.
+ *
+ * Recognised hyperparameters (constructor or per-`fit` call):
+ *  - `'regressor`  the underlying [[LinearRegressorFitter]] (default: [[OrdinaryLeastSquares]])
+ *  - `'iterations` number of iterations, at least 1 (default: 3)
+ *  - `'cacheHint`  cache hint used when checkpointing the lagged series (default: MEMORY_ONLY)
+ *
+ * @param hyperparameters (name, value) settings applied at construction time
+ */
+class CochraneOrcutt[K](hyperparameters: (Symbol, Any)*) extends LinearRegressorFitter[K] {
+
+  // These are populated by setHyperparameters(). The constructor varargs are a Seq, not a Map,
+  // so they must be converted with toMap; casting them to Map fails with ClassCastException.
+  var regressor: LinearRegressorFitter[K] = _
+  var iterations: Int = _
+  var cacheHint: CacheHint.CacheHint = _
+  // For larger inputs, CacheHint.MEMORY_AND_DISK2 is recommended.
+
+  // Constructor-level settings, kept so that per-call overrides passed to fit() are layered on top.
+  private val constructorHyperparameters: Map[Symbol, Any] = hyperparameters.toMap
+
+  /**
+   * Applies the given settings, falling back to the defaults for any key that is absent.
+   * @param hyperparameters (name -> value) settings
+   */
+  def setHyperparameters(hyperparameters: Map[Symbol, Any] = Map('foo -> None)): Unit = {
+    setStandardHyperparameters(hyperparameters)
+    regressor = hyperparameters
+      .getOrElse('regressor, new OrdinaryLeastSquares[K]())
+      .asInstanceOf[LinearRegressorFitter[K]]
+    regressor.calcStandardErrors = false
+    regressor.calcCommonStatistics = false
+    iterations = hyperparameters.getOrElse('iterations, 3).asInstanceOf[Int]
+    cacheHint = hyperparameters.getOrElse('cacheHint, CacheHint.MEMORY_ONLY).asInstanceOf[CacheHint.CacheHint]
+  }
+
+  setHyperparameters(constructorHyperparameters)
+
+  // Runs the underlying regressor with statistics switched on or off. The flags are passed as
+  // hyperparameters as well as set on the fitter, because a fitter may re-derive its flags from
+  // the hyperparameters of each fit() call and thereby overwrite the fields.
+  private def fitRegressor(drmFeatures: DrmLike[K],
+                           drmTarget: DrmLike[K],
+                           withStatistics: Boolean): LinearRegressorModel[K] = {
+    regressor.calcStandardErrors = withStatistics
+    regressor.calcCommonStatistics = withStatistics
+    regressor.fit(drmFeatures, drmTarget,
+      'calcStandardErrors -> withStatistics,
+      'calcCommonStatistics -> withStatistics)
+  }
+
+  /**
+   * Fits the model.
+   *
+   * @param drmFeatures feature DRM
+   * @param drmTarget target DRM (column 0 is used for the lagged series)
+   * @param hyperparameters optional per-call overrides, layered on the constructor settings
+   * @return the fitted model, carrying the coefficient vector recorded on every iteration
+   * @throws IllegalArgumentException if `'iterations` is less than 1
+   */
+  def fit(drmFeatures: DrmLike[K], drmTarget: DrmLike[K], hyperparameters: (Symbol, Any)*): CochraneOrcuttModel[K] = {
+
+    if (hyperparameters.nonEmpty) {
+      setHyperparameters(constructorHyperparameters ++ hyperparameters.toMap)
+    }
+    require(iterations >= 1, s"'iterations must be at least 1, got ${iterations}")
+
+    val lastIteration = iterations - 1
+    val betas = new Array[MahoutVector](iterations)
+    // With a single iteration the initial fit is also the final one, so it carries the statistics.
+    var regressionModel: LinearRegressorModel[K] =
+      fitRegressor(drmFeatures, drmTarget, withStatistics = lastIteration == 0)
+    betas(0) = regressionModel.beta
+    // todo add dw test option on each iteration
+
+    val n = safeToNonNegInt(drmTarget.nrow)
+    val Y = drmTarget(1 until n, 0 until 1).checkpoint(cacheHint)
+    val Y_lag = drmTarget(0 until n - 1, 0 until 1).checkpoint(cacheHint)
+    // NOTE(review): only column 0 of the features is sliced here - confirm whether all columns are intended.
+    val X = drmFeatures(1 until n, 0 until 1).checkpoint(cacheHint)
+    val X_lag = drmFeatures(0 until n - 1, 0 until 1).checkpoint(cacheHint)
+    for (i <- 1 until iterations) {
+      // NOTE(review): `error`, `drmYprime` and `drmXprime` are built but never fed to the regressor,
+      // and `rho` is read from a fit on the untransformed data, so the estimation step looks
+      // unfinished. It is left as-is here; confirm the intended algorithm before relying on results.
+      val error = drmTarget - regressionModel.predict(drmFeatures)
+      regressionModel = fitRegressor(drmFeatures, drmTarget, withStatistics = false)
+      val rho = regressionModel.beta.get(0)
+
+      val drmYprime = Y - Y_lag * rho
+      val drmXprime = X - X_lag * rho
+
+      // calculate common stats and SE on last iteration only
+      // todo make this optional- but if you don't care then why are you even bothering to do this?
+      regressionModel = fitRegressor(drmFeatures, drmTarget, withStatistics = i == lastIteration)
+      val betaPrime = regressionModel.beta
+      // NOTE(review): index 0 is rescaled as the intercept, but a fitter that appends the intercept
+      // column last would keep it at the last index - confirm.
+      val b0 = betaPrime(0) / (1 - rho)
+      betaPrime(0) = b0
+      betas(i) = betaPrime
+    }
+
+    val model = new CochraneOrcuttModel[K](regressionModel)
+    model.betas = betas
+    // Report the estimate of each iteration (betas(i)), not the toString of the whole array.
+    val iterationLines = (0 until iterations)
+      .map(i => "Beta estimates on iteration " + i + ": " + model.betas(i).toString + "\n")
+      .mkString("")
+    model.summary = iterationLines + "\n\n" + "Final Model:\n\n" + regressionModel.summary
+
+    model
+  }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
new file mode 100644
index 0000000..555ee6c
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.regression
+
+import org.apache.mahout.math.algorithms.regression.tests.FittnessTests
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.drm.DrmLike
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.scalabindings.dvec
+import org.apache.mahout.math.{Matrix, Vector => MahoutVector}
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import scala.language.higherKinds
+
+/**
+ * State shared by linear regression models: the coefficient vector plus the
+ * per-coefficient statistics that a fitter may fill in.
+ */
+trait LinearRegressorModel[K] extends RegressorModel[K] {
+
+ // Estimated coefficients.
+ var beta: MahoutVector = _
+ // Standard error of each coefficient.
+ var se: MahoutVector = _
+ // t-score of each coefficient (coefficient divided by its standard error).
+ var tScore: MahoutVector = _
+ // p-value of each coefficient.
+ var pval: MahoutVector = _
+ // Degrees of freedom recorded by the fitter.
+ var degreesFreedom: Int = _
+
+}
+
+/**
+ * Base trait for fitters of linear regression models. Holds the three standard flags and the
+ * shared post-processing that builds the model summary and optional statistics.
+ */
+trait LinearRegressorFitter[K] extends RegressorFitter[K] {
+
+  // Whether a constant column is added to the features; the last coefficient is then the intercept.
+  var addIntercept: Boolean = _
+  // Whether standard errors, t-scores and p-values are computed.
+  var calcStandardErrors: Boolean = _
+  // Whether the common fit statistics (R^2, MSE) are computed.
+  var calcCommonStatistics: Boolean = _
+
+  def fit(drmX: DrmLike[K],
+          drmTarget: DrmLike[K],
+          hyperparameters: (Symbol, Any)*): LinearRegressorModel[K]
+
+
+  /**
+   * Sets the three standard flags from the given settings; each flag that is absent defaults to true.
+   * @param hyperparameters (name -> value) settings
+   */
+  def setStandardHyperparameters(hyperparameters: Map[Symbol, Any] = Map('foo -> None)): Unit = {
+    calcCommonStatistics = hyperparameters.asInstanceOf[Map[Symbol, Boolean]].getOrElse('calcCommonStatistics, true)
+    calcStandardErrors = hyperparameters.asInstanceOf[Map[Symbol, Boolean]].getOrElse('calcStandardErrors, true)
+    addIntercept = hyperparameters.asInstanceOf[Map[Symbol, Boolean]].getOrElse('addIntercept, true)
+  }
+
+  /**
+   * Computes standard errors, t-scores and two-sided p-values for the coefficients of `model`,
+   * writes them and a tabular summary onto the model, and (if enabled) adds the common statistics.
+   *
+   * @param X feature DRM exactly as used for fitting (including any intercept column)
+   * @param drmTarget target DRM
+   * @param drmXtXinv inverse of X'X as an in-core matrix
+   * @param model the fitted model to annotate
+   * @return the annotated model
+   * @throws IllegalArgumentException if there are not more observations than coefficients
+   */
+  def calculateStandardError[M[K] <: LinearRegressorModel[K]](X: DrmLike[K],
+                                                              drmTarget: DrmLike[K],
+                                                              drmXtXinv: Matrix,
+                                                              model: M[K]): M[K] = {
+    import org.apache.mahout.math.function.Functions.SQRT
+    import org.apache.mahout.math.scalabindings.MahoutCollections._
+    var modelOut = model
+    val yhat = X %*% model.beta
+    val residuals = drmTarget - yhat
+    val ete = (residuals.t %*% residuals).collect // 1x1
+    val n = drmTarget.nrow
+    val k = safeToNonNegInt(X.ncol)
+    // The t-distribution below needs strictly positive degrees of freedom.
+    require(n > k, s"Need more observations (${n}) than coefficients (${k}) to compute standard errors.")
+    val invDegFreedomKindOf = 1.0 / (n - k)
+    val varCovarMatrix = invDegFreedomKindOf * ete(0,0) * drmXtXinv
+    val se = varCovarMatrix.viewDiagonal.assign(SQRT)
+    val tScore = model.beta / se
+    val tDist = new org.apache.commons.math3.distribution.TDistribution(n-k)
+    // Two-sided p-value: use |t| so that negative t-scores do not yield values above 1.
+    val pval = dvec(tScore.toArray.map(t => 2 * (1.0 - tDist.cumulativeProbability(math.abs(t))) ))
+    modelOut.summary = "Coef.\t\tEstimate\t\tStd. Error\t\tt-score\t\t\tPr(Beta=0)\n" +
+      (0 until k).map(i => s"X${i}\t${model.beta(i)}\t${se(i)}\t${tScore(i)}\t${pval(i)}").mkString("\n")
+
+    modelOut.se = se
+    modelOut.tScore = tScore
+    modelOut.pval = pval
+    // NOTE(review): this records the number of coefficients, not n - k - confirm which is intended.
+    modelOut.degreesFreedom = X.ncol
+
+    if (calcCommonStatistics){
+      modelOut = calculateCommonStatistics(modelOut, drmTarget, residuals)
+    }
+    modelOut
+  }
+
+  /**
+   * Adds the coefficient of determination and the mean squared error to the model.
+   *
+   * @param model the fitted model to annotate
+   * @param drmTarget target DRM
+   * @param residuals target minus prediction
+   * @return the annotated model
+   */
+  def calculateCommonStatistics[M[K] <: LinearRegressorModel[K]](model: M[K],
+                                                                 drmTarget: DrmLike[K],
+                                                                 residuals: DrmLike[K]): M[K] ={
+    var modelOut = model
+    modelOut = FittnessTests.CoefficientOfDetermination(modelOut, drmTarget, residuals)
+    modelOut = FittnessTests.MeanSquareError(modelOut, residuals)
+    modelOut
+  }
+
+  /**
+   * Builds the model summary and optional statistics according to the fitter's flags, and labels
+   * the last coefficient as the intercept when `addIntercept` is set.
+   *
+   * @param model the fitted model (its `beta` must already be set)
+   * @param X feature DRM exactly as used for fitting (including any intercept column)
+   * @param drmTarget target DRM
+   * @param drmXtXinv inverse of X'X as an in-core matrix
+   * @return the post-processed model
+   */
+  def modelPostprocessing[M[K] <: LinearRegressorModel[K]](model: M[K],
+                                                           X: DrmLike[K],
+                                                           drmTarget: DrmLike[K],
+                                                           drmXtXinv: Matrix): M[K] = {
+    var modelOut = model
+    if (calcStandardErrors) {
+      modelOut = calculateStandardError(X, drmTarget, drmXtXinv, modelOut)
+    } else {
+      modelOut.summary = "Coef.\t\tEstimate\n" +
+        (0 until X.ncol).map(i => s"X${i}\t${modelOut.beta(i)}").mkString("\n")
+      if (calcCommonStatistics) { // we do this in calcStandard errors to avoid calculating residuals twice
+        val residuals = drmTarget - (X %*% modelOut.beta)
+        modelOut = calculateCommonStatistics(modelOut, drmTarget, residuals)
+      }
+    }
+
+    if (addIntercept) {
+      // Strings are immutable: the result of replace() has to be assigned back to take effect.
+      modelOut.summary = modelOut.summary.replace(s"X${X.ncol - 1}", "(Intercept)")
+    }
+    modelOut
+  }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
new file mode 100644
index 0000000..d59701a
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.regression
+
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.drm.DrmLike
+import org.apache.mahout.math.scalabindings._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+
+/**
+ * Ordinary least squares model: predictions are the features multiplied by `beta`.
+ */
+class OrdinaryLeastSquaresModel[K]
+  extends LinearRegressorModel[K] {
+  // https://en.wikipedia.org/wiki/Ordinary_least_squares
+
+  /**
+   * Predicts the target for the given features.
+   *
+   * When `beta` has exactly one more entry than `drmPredictors` has columns, the model was fitted
+   * with an intercept column appended last, so the same constant column is appended here before
+   * multiplying; otherwise the product would fail on mismatched dimensions.
+   *
+   * @param drmPredictors feature DRM, with or without the trailing constant column
+   * @return a one-column DRM of predictions
+   */
+  def predict(drmPredictors: DrmLike[K]): DrmLike[K] = {
+    val X: DrmLike[K] =
+      if (drmPredictors.ncol + 1 == beta.size()) drmPredictors.cbind(1) else drmPredictors
+    X %*% beta
+  }
+
+}
+
+/**
+ * Closed-form ordinary least squares fitter: beta = (X'X)^-1 X'y.
+ */
+class OrdinaryLeastSquares[K] extends LinearRegressorFitter[K] {
+
+  /**
+   * Fits an OLS model.
+   *
+   * The standard flags (`'addIntercept`, `'calcStandardErrors`, `'calcCommonStatistics`) are
+   * re-derived from `hyperparameters` on every call; flags that are not passed default to true.
+   *
+   * @param drmFeatures feature DRM
+   * @param drmTarget target DRM with the same number of rows as `drmFeatures`
+   * @param hyperparameters optional (name, value) settings
+   * @return the fitted model
+   * @throws IllegalArgumentException if features and target differ in row count
+   */
+  def fit(drmFeatures: DrmLike[K],
+          drmTarget: DrmLike[K],
+          hyperparameters: (Symbol, Any)*): OrdinaryLeastSquaresModel[K] = {
+
+    val model = new OrdinaryLeastSquaresModel[K]()
+    setStandardHyperparameters(hyperparameters.toMap)
+
+    if (drmFeatures.nrow != drmTarget.nrow){
+      throw new IllegalArgumentException(s"${drmFeatures.nrow} observations in features, ${drmTarget.nrow} observations in target, must be equal.")
+    }
+
+    // The intercept, when requested, is a constant column appended last.
+    val X: DrmLike[K] = if (addIntercept) drmFeatures.cbind(1) else drmFeatures
+
+    // Compute and collect X'X once; this fails when a (columns x columns) matrix won't fit in the driver.
+    val XtX = (X.t %*% X).collect
+    val drmXtXinv = solve(XtX)
+    val drmXty = (X.t %*% drmTarget).collect
+    model.beta = (drmXtXinv %*% drmXty)(::, 0)
+
+    this.modelPostprocessing(model, X, drmTarget, drmXtXinv)
+  }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
new file mode 100644
index 0000000..bdddb29
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.regression
+
+import org.apache.mahout.math.algorithms.{SupervisedFitter, SupervisedModel}
+import org.apache.mahout.math.drm.DrmLike
+
+/**
+ * A fitted regression model that can predict a target from a feature DRM.
+ */
+trait RegressorModel[K] extends SupervisedModel[K] {
+
+  /**
+   * Predicts the target for the given features.
+   * @param drmPredictors feature DRM
+   * @return DRM of predictions
+   */
+  def predict(drmPredictors: DrmLike[K]): DrmLike[K]
+
+  // Common Applicable Tests- here only for convenience.
+  var mse: Double = _
+  var r2: Double = _
+
+  /**
+   * Syntactic sugar for fetching test results. Returns the test result if it exists, otherwise None.
+   * @param testSymbol - symbol of the test result to fetch, e.g. `'mse`
+   * @tparam T - The expected type of the result; the cast is unchecked
+   * @return `Some(result)` if a result is stored under `testSymbol`, otherwise `None`
+   */
+  def getTestResult[T](testSymbol: Symbol): Option[T] = {
+    // get() already yields an Option; cast the value inside it rather than wrapping the Option again.
+    testResults.get(testSymbol).map(_.asInstanceOf[T])
+  }
+}
+
+/**
+ * Fitter that produces a [[RegressorModel]] from a feature DRM and a target DRM.
+ */
+trait RegressorFitter[K] extends SupervisedFitter[K, RegressorModel[K]] {
+
+  // Holds the model produced by the most recent `fitPredict` call.
+  var model: RegressorModel[K] = _
+
+  /**
+   * Fits a model on the given data, stores it in `model`, and returns its predictions for `drmX`.
+   * @param drmX feature DRM
+   * @param drmTarget target DRM
+   * @param hyperparameters optional (name, value) settings forwarded to `fit`
+   * @return DRM of predictions for `drmX`
+   */
+  def fitPredict(drmX: DrmLike[K],
+                 drmTarget: DrmLike[K],
+                 hyperparameters: (Symbol, Any)* ): DrmLike[K] = {
+    val fitted = fit(drmX, drmTarget, hyperparameters: _*)
+    model = fitted
+    fitted.predict(drmX)
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/AutocorrelationTests.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/AutocorrelationTests.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/AutocorrelationTests.scala
new file mode 100644
index 0000000..2b16b74
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/AutocorrelationTests.scala
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.regression.tests
+
+import org.apache.mahout.math.algorithms.regression.RegressorModel
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.drm.DrmLike
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.function.Functions.SQUARE
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import scala.language.higherKinds
+
+/**
+ * Tests for autocorrelation in regression residuals.
+ */
+object AutocorrelationTests {
+
+ //https://en.wikipedia.org/wiki/Durbin%E2%80%93Watson_statistic
+ /*
+ To test for positive autocorrelation at significance \u03b1, the test statistic d is compared to lower and upper critical values (dL,\u03b1 and dU,\u03b1):
+ If d < dL,\u03b1, there is statistical evidence that the error terms are positively autocorrelated.
+ If d > dU,\u03b1, there is no statistical evidence that the error terms are positively autocorrelated.
+ If dL,\u03b1 < d < dU,\u03b1, the test is inconclusive.
+
+ Rule of Thumb:
+ d < 2 : positive auto-correlation
+ d = 2 : no auto-correlation
+ d > 2 : negative auto-correlation
+ */
+ /**
+ * Computes the Durbin-Watson statistic from the residuals: the sum of squared differences
+ * between successive residuals divided by the sum of squared residuals. The statistic is
+ * stored in the model's test results under `'durbinWatsonTestStatistic` and appended to its summary.
+ *
+ * @param model the model to annotate
+ * @param residuals residual DRM; column 0 is used for the successive differences
+ * @return the same model, annotated
+ */
+ def DurbinWatson[R[K] <: RegressorModel[K], K](model: R[K], residuals: DrmLike[K]): R[K] = {
+
+ val n = safeToNonNegInt(residuals.nrow)
+ // Residuals 1..n-1 and their one-step lag 0..n-2.
+ val e: DrmLike[K] = residuals(1 until n , 0 until 1)
+ val e_t_1: DrmLike[K] = residuals(0 until n - 1, 0 until 1)
+ val numerator = (e - e_t_1).assign(SQUARE).colSums()
+ val denominator = residuals.assign(SQUARE).colSums()
+ val dw = numerator / denominator
+ model.testResults += ('durbinWatsonTestStatistic \u2192 dw.get(0))
+ model.summary += s"\nDurbin Watson Test Statistic: ${dw.toString}"
+ model
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala
new file mode 100644
index 0000000..d1dd3bb
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.regression.tests
+
+import org.apache.mahout.math.algorithms.regression.RegressorModel
+import org.apache.mahout.math.algorithms.preprocessing.MeanCenter
+import org.apache.mahout.math.drm.DrmLike
+import org.apache.mahout.math.function.Functions.SQUARE
+import org.apache.mahout.math.scalabindings.RLikeOps._
+
+import scala.language.higherKinds
+import scala.reflect.ClassTag
+
+/**
+ * Goodness-of-fit statistics for regression models. Each function records its result on the
+ * model (field, test results and summary) and returns that model.
+ */
+object FittnessTests {
+
+ // https://en.wikipedia.org/wiki/Coefficient_of_determination
+ /**
+ * Computes R^2 = 1 - (sum of squared residuals / sum of squared mean-centered target) and
+ * stores it in `model.r2`, in the test results under `'r2`, and in the summary.
+ *
+ * @param model the model to annotate
+ * @param drmTarget target DRM
+ * @param residuals target minus prediction
+ * @return the same model, annotated
+ */
+ def CoefficientOfDetermination[R[K] <: RegressorModel[K], K](model: R[K],
+ drmTarget: DrmLike[K],
+ residuals: DrmLike[K]): R[K] = {
+ val sumSquareResiduals = residuals.assign(SQUARE).sum
+ // Mean-centering the target gives the deviations used for the total sum of squares.
+ val mc = new MeanCenter()
+ val totalResiduals = mc.fitTransform(drmTarget)
+ val sumSquareTotal = totalResiduals.assign(SQUARE).sum
+ val r2 = 1 - (sumSquareResiduals / sumSquareTotal)
+ model.r2 = r2
+ model.testResults += ('r2 -> r2) // need setResult and setSummary method incase you change in future, also to initialize map if non exists or update value if it does
+ model.summary += s"\nR^2: ${r2}"
+ model
+ }
+
+ // https://en.wikipedia.org/wiki/Mean_squared_error
+ /**
+ * Computes the mean squared error (sum of squared residuals divided by the number of rows) and
+ * stores it in `model.mse`, in the test results under `'mse`, and in the summary.
+ *
+ * @param model the model to annotate
+ * @param residuals target minus prediction
+ * @return the same model, annotated
+ */
+ def MeanSquareError[R[K] <: RegressorModel[K], K](model: R[K], residuals: DrmLike[K]): R[K] = {
+ val mse = residuals.assign(SQUARE).sum / residuals.nrow
+ model.mse = mse
+ model.testResults += ('mse -> mse)
+ model.summary += s"\nMean Squared Error: ${mse}"
+ model
+ }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala
new file mode 100644
index 0000000..9e8f029
--- /dev/null
+++ b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.math.algorithms.preprocessing.{AsFactor, AsFactorModel}
+import org.apache.mahout.math.drm.drmParallelize
+import org.apache.mahout.math.scalabindings.{dense, sparse, svec}
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.test.DistributedMahoutSuite
+import org.scalatest.{FunSuite, Matchers}
+
+/**
+ * Engine-independent tests for the preprocessing algorithms; mixed into a concrete FunSuite.
+ */
+trait PreprocessorSuiteBase extends DistributedMahoutSuite with Matchers {
+
+  this: FunSuite =>
+
+  test("asfactor test") {
+    val A = drmParallelize(dense(
+      (3, 2, 1, 2),
+      (0, 0, 0, 0),
+      (1, 1, 1, 1)), numPartitions = 2)
+
+    // 0 -> 2, 3 -> 5, 6 -> 9
+    val factorizer: AsFactorModel = new AsFactor().fit(A)
+
+    val factoredA = factorizer.transform(A)
+
+    println(factoredA)
+    println(factorizer.factorMap)
+    val correctAnswer = sparse(
+      svec((3 -> 1.0) :: (6 -> 1.0) :: (8 -> 1.0) :: (11 -> 1.0) :: Nil, cardinality = 12),
+      svec((0 -> 1.0) :: (4 -> 1.0) :: (7 -> 1.0) :: ( 9 -> 1.0) :: Nil, cardinality = 12),
+      svec((1 -> 1.0) :: (5 -> 1.0) :: (8 -> 1.0) :: (10 -> 1.0) :: Nil, cardinality = 12)
+    )
+
+    val myAnswer = factoredA.collect
+
+    val epsilon = 1E-6
+    // Compare the matrices element-wise via the norm of their difference. Comparing the two
+    // norms would also pass for a wrong matrix that merely has the same (or a smaller) norm.
+    (myAnswer - correctAnswer).norm should be <= epsilon
+
+  }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
new file mode 100644
index 0000000..2bb0343
--- /dev/null
+++ b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.math.algorithms.regression.OrdinaryLeastSquares
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.scalabindings._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.test.DistributedMahoutSuite
+import org.scalatest.{FunSuite, Matchers}
+
+/**
+ * Engine-independent tests for the regression algorithms; mixed into a concrete FunSuite.
+ */
+trait RegressionSuiteBase extends DistributedMahoutSuite with Matchers {
+  this: FunSuite =>
+
+  test("ordinary least squares") {
+    /*
+    R Prototype:
+    dataM <- matrix( c(2, 2, 10.5, 10, 29.509541,
+      1, 2, 12, 12, 18.042851,
+      1, 1, 12, 13, 22.736446,
+      2, 1, 11, 13, 32.207582,
+      1, 2, 12, 11, 21.871292,
+      2, 1, 16, 8, 36.187559,
+      6, 2, 17, 1, 50.764999,
+      3, 2, 13, 7, 40.400208,
+      3, 3, 13, 4, 45.811716), nrow=9, ncol=5, byrow=TRUE)
+
+
+    X = dataM[, c(1,2,3,4)]
+    y = dataM[, c(5)]
+
+    model <- lm(y ~ X )
+    summary(model)
+
+     */
+
+    val drmData = drmParallelize(dense(
+      (2, 2, 10.5, 10, 29.509541), // Apple Cinnamon Cheerios
+      (1, 2, 12, 12, 18.042851), // Cap'n'Crunch
+      (1, 1, 12, 13, 22.736446), // Cocoa Puffs
+      (2, 1, 11, 13, 32.207582), // Froot Loops
+      (1, 2, 12, 11, 21.871292), // Honey Graham Ohs
+      (2, 1, 16, 8, 36.187559), // Wheaties Honey Gold
+      (6, 2, 17, 1, 50.764999), // Cheerios
+      (3, 2, 13, 7, 40.400208), // Clusters
+      (3, 3, 13, 4, 45.811716)), numPartitions = 2)
+
+
+    val drmX = drmData(::, 0 until 4)
+    val drmY = drmData(::, 4 until 5)
+
+    val model = new OrdinaryLeastSquares[Int]().fit(drmX, drmY, 'calcCommonStatistics -> false)
+
+    val estimate = model.beta
+    // Reference coefficients from R, intercept last; R reports them to six decimals.
+    val Ranswers = dvec(-1.336265, -13.157702, -4.152654, -5.679908, 163.179329)
+
+    // Use the Euclidean norm of the difference: a signed sum lets errors cancel out and passes
+    // for any estimate that is too small. The tolerance allows for the rounding of the R values.
+    val epsilon = 1E-5
+    (estimate - Ranswers).norm(2) should be < epsilon
+
+    // TODO add test for S.E / pvalue
+  }
+
+
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala
new file mode 100644
index 0000000..1178a9b
--- /dev/null
+++ b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.math.algorithms.regression.OrdinaryLeastSquares
+import org.apache.mahout.math.drm.drmParallelize
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.scalabindings.{`::`, dense}
+import org.apache.mahout.test.DistributedMahoutSuite
+import org.scalatest.{FunSuite, Matchers}
+
+
+/** Backend-neutral goodness-of-fit tests; the self-type requires mixing into a FunSuite. */
+trait RegressionTestsSuiteBase extends DistributedMahoutSuite with Matchers {
+  this: FunSuite =>
+
+  // Absolute tolerance used when comparing fitted statistics with the R reference values.
+  val epsilon = 1E-4
+
+  test("fittness tests") {
+    /*
+    R Prototype:
+    dataM <- matrix( c(2, 2, 10.5, 10, 29.509541,
+    1, 2, 12, 12, 18.042851,
+    1, 1, 12, 13, 22.736446,
+    2, 1, 11, 13, 32.207582,
+    1, 2, 12, 11, 21.871292,
+    2, 1, 16, 8, 36.187559,
+    6, 2, 17, 1, 50.764999,
+    3, 2, 13, 7, 40.400208,
+    3, 3, 13, 4, 45.811716), nrow=9, ncol=5, byrow=TRUE)
+
+    X = dataM[, c(1,2,3,4)]
+    y = dataM[, c(5)]
+
+    model <- lm(y ~ X)
+    summary(model)
+    */
+
+    val drmData = drmParallelize(dense(
+      (2, 2, 10.5, 10, 29.509541), // Apple Cinnamon Cheerios
+      (1, 2, 12, 12, 18.042851), // Cap'n'Crunch
+      (1, 1, 12, 13, 22.736446), // Cocoa Puffs
+      (2, 1, 11, 13, 32.207582), // Froot Loops
+      (1, 2, 12, 11, 21.871292), // Honey Graham Ohs
+      (2, 1, 16, 8, 36.187559), // Wheaties Honey Gold
+      (6, 2, 17, 1, 50.764999), // Cheerios
+      (3, 2, 13, 7, 40.400208), // Clusters
+      (3, 3, 13, 4, 45.811716)), numPartitions = 2)
+
+    val drmX = drmData(::, 0 until 4)
+    val drmY = drmData(::, 4 until 5)
+
+    val model = new OrdinaryLeastSquares[Int]().fit(drmX, drmY)
+
+    println(model.summary)
+    // Answers from running similar algorithm in R
+    val rR2 = 0.9425
+    val rMSE = 6.457157
+
+    val r2: Double = model.r2
+    val mse: Double = model.mse
+    println("R2: " + r2)
+    println("MSE: " + mse)
+    // Compare magnitudes: a signed difference passes trivially whenever the fitted
+    // statistic overshoots the reference value, so overestimates would go unnoticed.
+    math.abs(rR2 - r2) should be < epsilon
+    math.abs(rMSE - mse) should be < epsilon
+  }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/spark/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala b/spark/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala
new file mode 100644
index 0000000..4a1f074
--- /dev/null
+++ b/spark/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.sparkbindings.test.DistributedSparkSuite
+import org.scalatest.FunSuite
+
+/** Concrete FunSuite combining DistributedSparkSuite with the tests of PreprocessorSuiteBase. */
+class PreprocessorSuite extends FunSuite with DistributedSparkSuite with PreprocessorSuiteBase
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala b/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala
new file mode 100644
index 0000000..bb99d61
--- /dev/null
+++ b/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.sparkbindings.test.DistributedSparkSuite
+import org.scalatest.FunSuite
+
+/** Concrete FunSuite combining DistributedSparkSuite with the tests of RegressionSuiteBase. */
+class RegressionSuite extends FunSuite with DistributedSparkSuite with RegressionSuiteBase
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala b/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala
new file mode 100644
index 0000000..07864f8
--- /dev/null
+++ b/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.sparkbindings.test.DistributedSparkSuite
+import org.scalatest.FunSuite
+
+/** Concrete FunSuite combining DistributedSparkSuite with the tests of RegressionTestsSuiteBase. */
+class RegressionTestsSuite extends FunSuite with DistributedSparkSuite with RegressionTestsSuiteBase
+