You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by de...@apache.org on 2017/02/04 03:34:55 UTC

incubator-systemml git commit: [SYSTEMML-1224] Migrate Vector and LabeledPoint classes from mllib to ml

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 4049ce407 -> 578e595fd


[SYSTEMML-1224] Migrate Vector and LabeledPoint classes from mllib to ml

Migrate:
mllib.linalg.DenseVector to ml.linalg.DenseVector.
mllib.linalg.Vector to ml.linalg.Vector.
mllib.linalg.Vectors to ml.linalg.Vectors.
mllib.linalg.VectorUDT to ml.linalg.VectorUDT.
mllib.regression.LabeledPoint to ml.feature.LabeledPoint.

Closes #369.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/578e595f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/578e595f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/578e595f

Branch: refs/heads/master
Commit: 578e595fdc506fb8a0c0b18c312fe420a406276d
Parents: 4049ce4
Author: Deron Eriksson <de...@us.ibm.com>
Authored: Fri Feb 3 19:31:44 2017 -0800
Committer: Deron Eriksson <de...@us.ibm.com>
Committed: Fri Feb 3 19:31:44 2017 -0800

----------------------------------------------------------------------
 .../java/org/apache/sysml/api/MLOutput.java     |  2 +-
 .../api/mlcontext/MLContextConversionUtil.java  |  2 +-
 .../sysml/api/mlcontext/MLContextUtil.java      |  2 +-
 .../spark/utils/FrameRDDConverterUtils.java     |  4 +-
 .../spark/utils/RDDConverterUtils.java          | 10 +--
 .../spark/utils/RDDConverterUtilsExt.java       | 69 +-------------------
 .../sysml/api/ml/LogisticRegression.scala       |  4 +-
 .../DataFrameVectorFrameConversionTest.java     |  4 +-
 .../mlcontext/DataFrameVectorScriptTest.java    |  4 +-
 .../integration/mlcontext/MLContextTest.java    |  6 +-
 .../sysml/api/ml/LogisticRegressionSuite.scala  |  6 +-
 11 files changed, 25 insertions(+), 88 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/api/MLOutput.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/MLOutput.java b/src/main/java/org/apache/sysml/api/MLOutput.java
index 6acca68..08a9a00 100644
--- a/src/main/java/org/apache/sysml/api/MLOutput.java
+++ b/src/main/java/org/apache/sysml/api/MLOutput.java
@@ -108,7 +108,7 @@ public class MLOutput {
 	 * Obtain the DataFrame
 	 * @param sqlContext the SQLContext
 	 * @param varName the variable name
-	 * @param outputVector if true, returns DataFrame with two column: ID and org.apache.spark.mllib.linalg.Vector
+	 * @param outputVector if true, returns DataFrame with two column: ID and org.apache.spark.ml.linalg.Vector
 	 * @return the DataFrame
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
index ca853ef..cca9d2c 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
@@ -33,7 +33,7 @@ import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
index 75e9c1e..9553acb 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
@@ -39,7 +39,7 @@ import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
index 3196f09..ae3b686 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
@@ -37,8 +37,8 @@ import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
 import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
index 356d16f..b5a4b58 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
@@ -35,11 +35,11 @@ import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
 import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.mllib.linalg.DenseVector;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.ml.linalg.DenseVector;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.feature.LabeledPoint;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
index f18c0a9..e0d347f 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
@@ -35,9 +35,9 @@ import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
 import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
 import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
 import org.apache.spark.sql.Dataset;
@@ -128,69 +128,6 @@ public class RDDConverterUtilsExt
 		return coordinateMatrixToBinaryBlock(new JavaSparkContext(sc), input, mcIn, true);
 	}
 
-	public static Dataset<Row> stringDataFrameToVectorDataFrame(SQLContext sqlContext, Dataset<Row> inputDF)
-			throws DMLRuntimeException {
-
-		StructField[] oldSchema = inputDF.schema().fields();
-		//create the new schema
-		StructField[] newSchema = new StructField[oldSchema.length];
-		for(int i = 0; i < oldSchema.length; i++) {
-			String colName = oldSchema[i].name();
-			newSchema[i] = DataTypes.createStructField(colName, new VectorUDT(), true);
-		}
-
-		//converter
-		class StringToVector implements Function<Tuple2<Row, Long>, Row> {
-			private static final long serialVersionUID = -4733816995375745659L;
-			@Override
-			public Row call(Tuple2<Row, Long> arg0) throws Exception {
-				Row oldRow = arg0._1;
-				int oldNumCols = oldRow.length();
-				if (oldNumCols > 1) {
-					throw new DMLRuntimeException("The row must have at most one column");
-				}
-
-				// parse the various strings. i.e
-				// ((1.2,4.3, 3.4))  or (1.2, 3.4, 2.2) or (1.2 3.4)
-				// [[1.2,34.3, 1.2, 1.2]] or [1.2, 3.4] or [1.3 1.2]
-				Object [] fields = new Object[oldNumCols];
-				ArrayList<Object> fieldsArr = new ArrayList<Object>();
-				for (int i = 0; i < oldRow.length(); i++) {
-					Object ci=oldRow.get(i);
-					if (ci instanceof String) {
-						String cis = (String)ci;
-						StringBuffer sb = new StringBuffer(cis.trim());
-						for (int nid=0; i < 2; i++) { //remove two level nesting
-							if ((sb.charAt(0) == '(' && sb.charAt(sb.length() - 1) == ')') ||
-									(sb.charAt(0) == '[' && sb.charAt(sb.length() - 1) == ']')
-									) {
-								sb.deleteCharAt(0);
-								sb.setLength(sb.length() - 1);
-							}
-						}
-						//have the replace code
-						String ncis = "[" + sb.toString().replaceAll(" *, *", ",") + "]";
-						Vector v = Vectors.parse(ncis);
-						fieldsArr.add(v);
-					} else {
-						throw new DMLRuntimeException("Only String is supported");
-					}
-				}
-				Row row = RowFactory.create(fieldsArr.toArray());
-				return row;
-			}
-		}
-
-		//output DF
-		JavaRDD<Row> newRows = inputDF.rdd().toJavaRDD().zipWithIndex().map(new StringToVector());
-		// DataFrame outDF = sqlContext.createDataFrame(newRows, new StructType(newSchema)); //TODO investigate why it doesn't work
-		Dataset<Row> outDF = sqlContext.createDataFrame(newRows.rdd(),
-				DataTypes.createStructType(newSchema));
-
-		return outDF;
-	}
-
-	
 	public static Dataset<Row> projectColumns(Dataset<Row> df, ArrayList<String> columns) throws DMLRuntimeException {
 		ArrayList<String> columnToSelect = new ArrayList<String>();
 		for(int i = 1; i < columns.size(); i++) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
index 18eadec..c0e3f35 100644
--- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
+++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
@@ -112,8 +112,8 @@ class LogisticRegressionModel(override val uid: String)(
 object LogisticRegressionExample {
   import org.apache.spark.{ SparkConf, SparkContext }
   import org.apache.spark.sql.types._
-  import org.apache.spark.mllib.linalg.Vectors
-  import org.apache.spark.mllib.regression.LabeledPoint
+  import org.apache.spark.ml.linalg.Vectors
+  import org.apache.spark.ml.feature.LabeledPoint
 
   def main(args: Array[String]) = {
     val sparkConf: SparkConf = new SparkConf();

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
index b152b58..c6d2251 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
@@ -25,8 +25,8 @@ import java.util.List;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.DenseVector;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.DenseVector;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
index 6ab0fd0..14ed4b7 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
@@ -27,8 +27,8 @@ import java.util.List;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.DenseVector;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.DenseVector;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
index 89241c5..2241ad1 100644
--- a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
@@ -46,9 +46,9 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
 import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
index 068db91..555d0a2 100644
--- a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
+++ b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
@@ -21,13 +21,13 @@ package org.apache.sysml.api.ml
 
 import org.scalatest.FunSuite
 import org.scalatest.Matchers
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.ml.feature.LabeledPoint
 import org.apache.spark.ml.Pipeline
 import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
 import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
 import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
-import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.ml.linalg.Vector
 import scala.reflect.runtime.universe._
 
 case class LabeledDocument[T:TypeTag](id: Long, text: String, label: Double)