You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by de...@apache.org on 2017/02/04 03:34:55 UTC
incubator-systemml git commit: [SYSTEMML-1224] Migrate Vector and LabeledPoint classes from mllib to ml
Repository: incubator-systemml
Updated Branches:
refs/heads/master 4049ce407 -> 578e595fd
[SYSTEMML-1224] Migrate Vector and LabeledPoint classes from mllib to ml
Migrate:
mllib.linalg.DenseVector to ml.linalg.DenseVector.
mllib.linalg.Vector to ml.linalg.Vector.
mllib.linalg.Vectors to ml.linalg.Vectors.
mllib.linalg.VectorUDT to ml.linalg.VectorUDT.
mllib.regression.LabeledPoint to ml.feature.LabeledPoint.
Closes #369.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/578e595f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/578e595f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/578e595f
Branch: refs/heads/master
Commit: 578e595fdc506fb8a0c0b18c312fe420a406276d
Parents: 4049ce4
Author: Deron Eriksson <de...@us.ibm.com>
Authored: Fri Feb 3 19:31:44 2017 -0800
Committer: Deron Eriksson <de...@us.ibm.com>
Committed: Fri Feb 3 19:31:44 2017 -0800
----------------------------------------------------------------------
.../java/org/apache/sysml/api/MLOutput.java | 2 +-
.../api/mlcontext/MLContextConversionUtil.java | 2 +-
.../sysml/api/mlcontext/MLContextUtil.java | 2 +-
.../spark/utils/FrameRDDConverterUtils.java | 4 +-
.../spark/utils/RDDConverterUtils.java | 10 +--
.../spark/utils/RDDConverterUtilsExt.java | 69 +-------------------
.../sysml/api/ml/LogisticRegression.scala | 4 +-
.../DataFrameVectorFrameConversionTest.java | 4 +-
.../mlcontext/DataFrameVectorScriptTest.java | 4 +-
.../integration/mlcontext/MLContextTest.java | 6 +-
.../sysml/api/ml/LogisticRegressionSuite.scala | 6 +-
11 files changed, 25 insertions(+), 88 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/api/MLOutput.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/MLOutput.java b/src/main/java/org/apache/sysml/api/MLOutput.java
index 6acca68..08a9a00 100644
--- a/src/main/java/org/apache/sysml/api/MLOutput.java
+++ b/src/main/java/org/apache/sysml/api/MLOutput.java
@@ -108,7 +108,7 @@ public class MLOutput {
* Obtain the DataFrame
* @param sqlContext the SQLContext
* @param varName the variable name
- * @param outputVector if true, returns DataFrame with two column: ID and org.apache.spark.mllib.linalg.Vector
+ * @param outputVector if true, returns DataFrame with two column: ID and org.apache.spark.ml.linalg.Vector
* @return the DataFrame
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
index ca853ef..cca9d2c 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
@@ -33,7 +33,7 @@ import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
index 75e9c1e..9553acb 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
@@ -39,7 +39,7 @@ import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
index 3196f09..ae3b686 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
@@ -37,8 +37,8 @@ import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
index 356d16f..b5a4b58 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
@@ -35,11 +35,11 @@ import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.mllib.linalg.DenseVector;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.ml.linalg.DenseVector;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.feature.LabeledPoint;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
index f18c0a9..e0d347f 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
@@ -35,9 +35,9 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
import org.apache.spark.sql.Dataset;
@@ -128,69 +128,6 @@ public class RDDConverterUtilsExt
return coordinateMatrixToBinaryBlock(new JavaSparkContext(sc), input, mcIn, true);
}
- public static Dataset<Row> stringDataFrameToVectorDataFrame(SQLContext sqlContext, Dataset<Row> inputDF)
- throws DMLRuntimeException {
-
- StructField[] oldSchema = inputDF.schema().fields();
- //create the new schema
- StructField[] newSchema = new StructField[oldSchema.length];
- for(int i = 0; i < oldSchema.length; i++) {
- String colName = oldSchema[i].name();
- newSchema[i] = DataTypes.createStructField(colName, new VectorUDT(), true);
- }
-
- //converter
- class StringToVector implements Function<Tuple2<Row, Long>, Row> {
- private static final long serialVersionUID = -4733816995375745659L;
- @Override
- public Row call(Tuple2<Row, Long> arg0) throws Exception {
- Row oldRow = arg0._1;
- int oldNumCols = oldRow.length();
- if (oldNumCols > 1) {
- throw new DMLRuntimeException("The row must have at most one column");
- }
-
- // parse the various strings. i.e
- // ((1.2,4.3, 3.4)) or (1.2, 3.4, 2.2) or (1.2 3.4)
- // [[1.2,34.3, 1.2, 1.2]] or [1.2, 3.4] or [1.3 1.2]
- Object [] fields = new Object[oldNumCols];
- ArrayList<Object> fieldsArr = new ArrayList<Object>();
- for (int i = 0; i < oldRow.length(); i++) {
- Object ci=oldRow.get(i);
- if (ci instanceof String) {
- String cis = (String)ci;
- StringBuffer sb = new StringBuffer(cis.trim());
- for (int nid=0; i < 2; i++) { //remove two level nesting
- if ((sb.charAt(0) == '(' && sb.charAt(sb.length() - 1) == ')') ||
- (sb.charAt(0) == '[' && sb.charAt(sb.length() - 1) == ']')
- ) {
- sb.deleteCharAt(0);
- sb.setLength(sb.length() - 1);
- }
- }
- //have the replace code
- String ncis = "[" + sb.toString().replaceAll(" *, *", ",") + "]";
- Vector v = Vectors.parse(ncis);
- fieldsArr.add(v);
- } else {
- throw new DMLRuntimeException("Only String is supported");
- }
- }
- Row row = RowFactory.create(fieldsArr.toArray());
- return row;
- }
- }
-
- //output DF
- JavaRDD<Row> newRows = inputDF.rdd().toJavaRDD().zipWithIndex().map(new StringToVector());
- // DataFrame outDF = sqlContext.createDataFrame(newRows, new StructType(newSchema)); //TODO investigate why it doesn't work
- Dataset<Row> outDF = sqlContext.createDataFrame(newRows.rdd(),
- DataTypes.createStructType(newSchema));
-
- return outDF;
- }
-
-
public static Dataset<Row> projectColumns(Dataset<Row> df, ArrayList<String> columns) throws DMLRuntimeException {
ArrayList<String> columnToSelect = new ArrayList<String>();
for(int i = 1; i < columns.size(); i++) {
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
index 18eadec..c0e3f35 100644
--- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
+++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
@@ -112,8 +112,8 @@ class LogisticRegressionModel(override val uid: String)(
object LogisticRegressionExample {
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.sql.types._
- import org.apache.spark.mllib.linalg.Vectors
- import org.apache.spark.mllib.regression.LabeledPoint
+ import org.apache.spark.ml.linalg.Vectors
+ import org.apache.spark.ml.feature.LabeledPoint
def main(args: Array[String]) = {
val sparkConf: SparkConf = new SparkConf();
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
index b152b58..c6d2251 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
@@ -25,8 +25,8 @@ import java.util.List;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.DenseVector;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.DenseVector;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
index 6ab0fd0..14ed4b7 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
@@ -27,8 +27,8 @@ import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.DenseVector;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.DenseVector;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
index 89241c5..2241ad1 100644
--- a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
@@ -46,9 +46,9 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
index 068db91..555d0a2 100644
--- a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
+++ b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
@@ -21,13 +21,13 @@ package org.apache.sysml.api.ml
import org.scalatest.FunSuite
import org.scalatest.Matchers
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
-import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.ml.linalg.Vector
import scala.reflect.runtime.universe._
case class LabeledDocument[T:TypeTag](id: Long, text: String, label: Double)