You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by de...@apache.org on 2016/08/17 04:42:08 UTC
incubator-systemml git commit: [SYSTEMML-474] MLContext matrix from URL
Repository: incubator-systemml
Updated Branches:
refs/heads/master 50541d32e -> e6db69b87
[SYSTEMML-474] MLContext matrix from URL
Initial support for reading IJV and CSV matrices into MLContext via URLs.
Closes #210.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/e6db69b8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/e6db69b8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/e6db69b8
Branch: refs/heads/master
Commit: e6db69b873a3ac8be022f5ba7273d37124e34497
Parents: 50541d3
Author: Deron Eriksson <de...@us.ibm.com>
Authored: Tue Aug 16 21:38:27 2016 -0700
Committer: Deron Eriksson <de...@us.ibm.com>
Committed: Tue Aug 16 21:38:27 2016 -0700
----------------------------------------------------------------------
docs/_config.yml | 2 +-
docs/spark-mlcontext-programming-guide.md | 101 +++++++++++++++++++
pom.xml | 1 +
.../api/mlcontext/MLContextConversionUtil.java | 79 +++++++++++++--
.../sysml/api/mlcontext/MLContextUtil.java | 35 ++-----
.../integration/mlcontext/MLContextTest.java | 52 +++++++++-
.../org/apache/sysml/api/mlcontext/1234.ijv | 4 +
.../org/apache/sysml/api/mlcontext/1234.ijv.mtd | 11 ++
8 files changed, 248 insertions(+), 37 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/docs/_config.yml
----------------------------------------------------------------------
diff --git a/docs/_config.yml b/docs/_config.yml
index 1a09658..2ef66a0 100644
--- a/docs/_config.yml
+++ b/docs/_config.yml
@@ -11,7 +11,7 @@ include:
- _modules
# These allow the documentation to be updated with newer releases
-SYSTEMML_VERSION: 0.11.0
+SYSTEMML_VERSION: 0.10.x
# if 'analytics_on' is true, analytics section will be rendered on the HTML pages
analytics_on: true
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/docs/spark-mlcontext-programming-guide.md
----------------------------------------------------------------------
diff --git a/docs/spark-mlcontext-programming-guide.md b/docs/spark-mlcontext-programming-guide.md
index 2f77347..71db1a4 100644
--- a/docs/spark-mlcontext-programming-guide.md
+++ b/docs/spark-mlcontext-programming-guide.md
@@ -662,6 +662,107 @@ None
</div>
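+Alternatively, we could supply a `java.net.URL` to the Script `in` method. If the matrix data at the URL is in CSV
+format, no metadata needs to be supplied. If the matrix data is in IJV format, matrix metadata specifying the IJV
+format needs to be supplied as well (for example, `new MatrixMetadata(MatrixFormat.IJV, 2, 2)` for a 2x2 matrix).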
+
+<div class="codetabs">
+
+<div data-lang="Scala" markdown="1">
+{% highlight scala %}
+val habermanUrl = "http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data"
+val typesRDD = sc.parallelize(Array("1.0,1.0,1.0,2.0"))
+val scriptUrl = "https://raw.githubusercontent.com/apache/incubator-systemml/master/scripts/algorithms/Univar-Stats.dml"
+val uni = dmlFromUrl(scriptUrl).in("A", new java.net.URL(habermanUrl)).in("K", typesRDD).in("$CONSOLE_OUTPUT", true)
+ml.execute(uni)
+{% endhighlight %}
+</div>
+
+<div data-lang="Spark Shell" markdown="1">
+{% highlight scala %}
+scala> val habermanUrl = "http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data"
+habermanUrl: String = http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data
+
+scala> val typesRDD = sc.parallelize(Array("1.0,1.0,1.0,2.0"))
+typesRDD: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[50] at parallelize at <console>:33
+
+scala> val scriptUrl = "https://raw.githubusercontent.com/apache/incubator-systemml/master/scripts/algorithms/Univar-Stats.dml"
+scriptUrl: String = https://raw.githubusercontent.com/apache/incubator-systemml/master/scripts/algorithms/Univar-Stats.dml
+
+scala> val uni = dmlFromUrl(scriptUrl).in("A", new java.net.URL(habermanUrl)).in("K", typesRDD).in("$CONSOLE_OUTPUT", true)
+uni: org.apache.sysml.api.mlcontext.Script =
+Inputs:
+ [1] (URL) A: http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data
+ [2] (RDD) K: ParallelCollectionRDD[50] at parallelize at <console>:33
+ [3] (Boolean) $CONSOLE_OUTPUT: true
+
+Outputs:
+None
+
+
+scala> ml.execute(uni)
+...
+-------------------------------------------------
+ (01) Minimum | 30.0
+ (02) Maximum | 83.0
+ (03) Range | 53.0
+ (04) Mean | 52.45751633986928
+ (05) Variance | 116.71458266366658
+ (06) Std deviation | 10.803452349303281
+ (07) Std err of mean | 0.6175922641866753
+ (08) Coeff of variation | 0.20594669940735139
+ (09) Skewness | 0.1450718616532357
+ (10) Kurtosis | -0.6150152487211726
+ (11) Std err of skewness | 0.13934809593495995
+ (12) Std err of kurtosis | 0.277810485320835
+ (13) Median | 52.0
+ (14) Interquartile mean | 52.16013071895425
+Feature [1]: Scale
+-------------------------------------------------
+ (01) Minimum | 58.0
+ (02) Maximum | 69.0
+ (03) Range | 11.0
+ (04) Mean | 62.85294117647059
+ (05) Variance | 10.558630665380907
+ (06) Std deviation | 3.2494046632238507
+ (07) Std err of mean | 0.18575610076612029
+ (08) Coeff of variation | 0.051698529971741194
+ (09) Skewness | 0.07798443581479181
+ (10) Kurtosis | -1.1324380182967442
+ (11) Std err of skewness | 0.13934809593495995
+ (12) Std err of kurtosis | 0.277810485320835
+ (13) Median | 63.0
+ (14) Interquartile mean | 62.80392156862745
+Feature [2]: Scale
+-------------------------------------------------
+ (01) Minimum | 0.0
+ (02) Maximum | 52.0
+ (03) Range | 52.0
+ (04) Mean | 4.026143790849673
+ (05) Variance | 51.691117539912135
+ (06) Std deviation | 7.189653506248555
+ (07) Std err of mean | 0.41100513466216837
+ (08) Coeff of variation | 1.7857418611299172
+ (09) Skewness | 2.954633471088322
+ (10) Kurtosis | 11.425776549251449
+ (11) Std err of skewness | 0.13934809593495995
+ (12) Std err of kurtosis | 0.277810485320835
+ (13) Median | 1.0
+ (14) Interquartile mean | 1.2483660130718954
+Feature [3]: Scale
+-------------------------------------------------
+Feature [4]: Categorical (Nominal)
+ (15) Num of categories | 2
+ (16) Mode | 1
+ (17) Num of modes | 1
+res5: org.apache.sysml.api.mlcontext.MLResults =
+None
+
+{% endhighlight %}
+</div>
+
+</div>
+
+
### Input Variables vs Input Parameters
If we examine the
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index b271cf7..a4c66a1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -577,6 +577,7 @@
<exclude>docs</exclude>
<exclude>**/docs/**</exclude>
<exclude>**/*.csv</exclude>
+ <exclude>**/*.ijv</exclude>
<exclude>**/*.json</exclude>
<exclude>**/*.mtx</exclude>
<exclude>**/*.mtd</exclude>
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
index 161ad17..33a5a3c 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
@@ -19,10 +19,13 @@
package org.apache.sysml.api.mlcontext;
+import java.io.InputStream;
+import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.Accumulator;
@@ -118,6 +121,71 @@ public class MLContextConversionUtil {
}
/**
+	 * Convert a matrix at a URL to a {@code MatrixObject}.
+	 * <p>
+	 * The content at the URL is read completely into a list of lines, which is
+	 * then parallelized into a {@code JavaRDD<String>} and converted using the
+	 * CSV or IJV conversion method. If {@code matrixMetadata} is {@code null},
+	 * the content is treated as CSV.
+	 *
+	 * @param variableName
+	 *            name of the variable associated with the matrix
+	 * @param url
+	 *            the URL to a matrix (in CSV or IJV format)
+	 * @param matrixMetadata
+	 *            the matrix metadata, or {@code null} for a CSV matrix without
+	 *            metadata
+	 * @return the matrix at a URL converted to a {@code MatrixObject}
+	 * @throws MLContextException
+	 *             if metadata is supplied with a matrix format other than CSV
+	 *             or IJV, or if reading the URL or converting its content
+	 *             fails
+	 */
+	public static MatrixObject urlToMatrixObject(String variableName, URL url, MatrixMetadata matrixMetadata) {
+		// Resolve the format before any network I/O so that an unsupported or
+		// unset format fails fast instead of silently yielding a null matrix.
+		MatrixFormat matrixFormat = (matrixMetadata == null) ? MatrixFormat.CSV : matrixMetadata.getMatrixFormat();
+		if ((matrixFormat != MatrixFormat.CSV) && (matrixFormat != MatrixFormat.IJV)) {
+			throw new MLContextException("Unsupported matrix format for URL '" + url + "': " + matrixFormat);
+		}
+		try {
+			List<String> lines;
+			InputStream is = url.openStream();
+			try {
+				lines = IOUtils.readLines(is);
+			} finally {
+				// always release the stream (and its underlying connection),
+				// even if reading the lines fails
+				IOUtils.closeQuietly(is);
+			}
+			MLContext activeMLContext = (MLContext) MLContextProxy.getActiveMLContext();
+			SparkContext sparkContext = activeMLContext.getSparkContext();
+			// the JavaSparkContext only wraps the active SparkContext, so it
+			// is intentionally not closed here
+			@SuppressWarnings("resource")
+			JavaSparkContext javaSparkContext = new JavaSparkContext(sparkContext);
+			JavaRDD<String> javaRDD = javaSparkContext.parallelize(lines);
+			if (matrixFormat == MatrixFormat.CSV) {
+				return javaRDDStringCSVToMatrixObject(variableName, javaRDD, matrixMetadata);
+			}
+			return javaRDDStringIJVToMatrixObject(variableName, javaRDD, matrixMetadata);
+		} catch (Exception e) {
+			throw new MLContextException("Exception converting URL to MatrixObject", e);
+		}
+	}
+
+	/**
+	 * Wrap a {@code MatrixBlock} in a {@code MatrixObject}.
+	 * <p>
+	 * The matrix object is created as a binary-block matrix of doubles whose
+	 * file name is the scratch space location followed by the variable name.
+	 * The block is handed over through {@code acquireModify} followed by
+	 * {@code release}.
+	 *
+	 * @param variableName
+	 *            name of the variable associated with the matrix
+	 * @param matrixBlock
+	 *            matrix as a MatrixBlock
+	 * @param matrixMetadata
+	 *            the matrix metadata; if {@code null}, default (empty) matrix
+	 *            characteristics are used
+	 * @return the {@code MatrixBlock} converted to a {@code MatrixObject}
+	 */
+	public static MatrixObject matrixBlockToMatrixObject(String variableName, MatrixBlock matrixBlock,
+			MatrixMetadata matrixMetadata) {
+		// without metadata, start from default matrix characteristics
+		MatrixCharacteristics mc = (matrixMetadata == null) ? new MatrixCharacteristics()
+				: matrixMetadata.asMatrixCharacteristics();
+		MatrixFormatMetaData formatMetaData = new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo,
+				InputInfo.BinaryBlockInputInfo);
+		String fileName = MLContextUtil.scratchSpace() + "/" + variableName;
+		MatrixObject result = new MatrixObject(ValueType.DOUBLE, fileName, formatMetaData);
+		try {
+			// install the block, then release the modify lock right away
+			result.acquireModify(matrixBlock);
+			result.release();
+		} catch (CacheException e) {
+			throw new MLContextException("Exception converting MatrixBlock to MatrixObject", e);
+		}
+		return result;
+	}
+
+ /**
* Convert a {@code JavaPairRDD<MatrixIndexes, MatrixBlock>} to a
* {@code MatrixObject}.
*
@@ -687,16 +755,13 @@ public class MLContextConversionUtil {
SparkContext sc = activeMLContext.getSparkContext();
SQLContext sqlContext = new SQLContext(sc);
DataFrame df = null;
- if(isVectorDF) {
+ if (isVectorDF) {
df = RDDConverterUtilsExt.binaryBlockToVectorDataFrame(binaryBlockMatrix, matrixCharacteristics,
sqlContext);
+ } else {
+ df = RDDConverterUtilsExt.binaryBlockToDataFrame(binaryBlockMatrix, matrixCharacteristics, sqlContext);
}
- else {
- df = RDDConverterUtilsExt.binaryBlockToDataFrame(binaryBlockMatrix, matrixCharacteristics,
- sqlContext);
- }
-
-
+
return df;
} catch (DMLRuntimeException e) {
throw new MLContextException("DMLRuntimeException while converting matrix object to DataFrame", e);
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
index fc942e9..ea7857e 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
@@ -20,6 +20,7 @@
package org.apache.sysml.api.mlcontext;
import java.io.FileNotFoundException;
+import java.net.URL;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
@@ -44,9 +45,7 @@ import org.apache.sysml.conf.CompilerConfig.ConfigType;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.parser.ParseException;
-import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.controlprogram.LocalVariableMap;
-import org.apache.sysml.runtime.controlprogram.caching.CacheException;
import org.apache.sysml.runtime.controlprogram.caching.FrameObject;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.instructions.cp.BooleanObject;
@@ -54,12 +53,8 @@ import org.apache.sysml.runtime.instructions.cp.Data;
import org.apache.sysml.runtime.instructions.cp.DoubleObject;
import org.apache.sysml.runtime.instructions.cp.IntObject;
import org.apache.sysml.runtime.instructions.cp.StringObject;
-import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
-import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
-import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
-import org.apache.sysml.runtime.matrix.data.OutputInfo;
/**
* Utility class containing methods for working with the MLContext API.
@@ -78,7 +73,7 @@ public final class MLContextUtil {
*/
@SuppressWarnings("rawtypes")
public static final Class[] COMPLEX_DATA_TYPES = { JavaRDD.class, RDD.class, DataFrame.class,
- BinaryBlockMatrix.class, Matrix.class, (new double[][] {}).getClass(), MatrixBlock.class };
+ BinaryBlockMatrix.class, Matrix.class, (new double[][] {}).getClass(), MatrixBlock.class, URL.class };
/**
* All data types supported by the MLContext API
@@ -343,7 +338,7 @@ public final class MLContextUtil {
/**
* Is the object one of the supported complex data types? (JavaRDD, RDD,
- * DataFrame, BinaryBlockMatrix, Matrix, double[][])
+ * DataFrame, BinaryBlockMatrix, Matrix, double[][], MatrixBlock, URL)
*
* @param object
* the object type to be examined
@@ -457,23 +452,11 @@ public final class MLContextUtil {
return matrixObject;
} else if (value instanceof MatrixBlock) {
- MatrixCharacteristics matrixCharacteristics;
- if (matrixMetadata != null) {
- matrixCharacteristics = matrixMetadata.asMatrixCharacteristics();
- } else {
- matrixCharacteristics = new MatrixCharacteristics();
- }
- MatrixFormatMetaData mtd = new MatrixFormatMetaData(matrixCharacteristics, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo);
- MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, MLContextUtil.scratchSpace() + "/" + name, mtd);
- try {
- matrixObject.acquireModify((MatrixBlock)value);
- matrixObject.release();
- } catch (CacheException e) {
- throw new MLContextException(e);
- }
+ MatrixBlock matrixBlock = (MatrixBlock) value;
+ MatrixObject matrixObject = MLContextConversionUtil.matrixBlockToMatrixObject(name, matrixBlock,
+ matrixMetadata);
return matrixObject;
- }
- else if (value instanceof DataFrame) {
+ } else if (value instanceof DataFrame) {
DataFrame dataFrame = (DataFrame) value;
MatrixObject matrixObject = MLContextConversionUtil
.dataFrameToMatrixObject(name, dataFrame, matrixMetadata);
@@ -496,6 +479,10 @@ public final class MLContextUtil {
MatrixObject matrixObject = MLContextConversionUtil.doubleMatrixToMatrixObject(name, doubleMatrix,
matrixMetadata);
return matrixObject;
+ } else if (value instanceof URL) {
+ URL url = (URL) value;
+ MatrixObject matrixObject = MLContextConversionUtil.urlToMatrixObject(name, url, matrixMetadata);
+ return matrixObject;
} else if (value instanceof Integer) {
Integer i = (Integer) value;
IntObject iObj = new IntObject(i);
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
index 4afd275..e6e1046 100644
--- a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
@@ -93,7 +93,7 @@ public class MLContextTest extends AutomatedTestBase {
sc = new JavaSparkContext(conf);
ml = new MLContext(sc);
}
-
+
@Override
public void setUp() {
addTestConfiguration(TEST_DIR, TEST_NAME);
@@ -1641,6 +1641,48 @@ public class MLContextTest extends AutomatedTestBase {
ml.execute(script);
}
+	/**
+	 * Passes a CSV matrix to a DML script as a URL (without matrix metadata)
+	 * and expects the sum of its cells on standard output.
+	 */
+	@Test
+	public void testCSVMatrixFromURLSumDML() throws MalformedURLException {
+		System.out.println("MLContextTest - CSV matrix from URL sum DML");
+		URL matrixUrl = new URL(
+				"https://raw.githubusercontent.com/apache/incubator-systemml/master/src/test/scripts/org/apache/sysml/api/mlcontext/1234.csv");
+		Script sumScript = dml("print('sum: ' + sum(M));").in("M", matrixUrl);
+		setExpectedStdOut("sum: 10.0");
+		ml.execute(sumScript);
+	}
+
+	/**
+	 * Passes a CSV matrix to a PYDML script as a URL (without matrix metadata)
+	 * and expects the sum of its cells on standard output.
+	 */
+	@Test
+	public void testCSVMatrixFromURLSumPYDML() throws MalformedURLException {
+		System.out.println("MLContextTest - CSV matrix from URL sum PYDML");
+		URL matrixUrl = new URL(
+				"https://raw.githubusercontent.com/apache/incubator-systemml/master/src/test/scripts/org/apache/sysml/api/mlcontext/1234.csv");
+		Script sumScript = pydml("print('sum: ' + sum(M))").in("M", matrixUrl);
+		setExpectedStdOut("sum: 10.0");
+		ml.execute(sumScript);
+	}
+
+	/**
+	 * Passes an IJV matrix to a DML script as a URL, together with metadata
+	 * declaring the IJV format and a 2x2 size, and expects the sum of its
+	 * cells on standard output.
+	 */
+	@Test
+	public void testIJVMatrixFromURLSumDML() throws MalformedURLException {
+		System.out.println("MLContextTest - IJV matrix from URL sum DML");
+		URL matrixUrl = new URL(
+				"https://raw.githubusercontent.com/apache/incubator-systemml/master/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv");
+		MatrixMetadata ijvMetadata = new MatrixMetadata(MatrixFormat.IJV, 2, 2);
+		Script sumScript = dml("print('sum: ' + sum(M));").in("M", matrixUrl, ijvMetadata);
+		setExpectedStdOut("sum: 10.0");
+		ml.execute(sumScript);
+	}
+
+	/**
+	 * Passes an IJV matrix to a PYDML script as a URL, together with metadata
+	 * declaring the IJV format and a 2x2 size, and expects the sum of its
+	 * cells on standard output.
+	 */
+	@Test
+	public void testIJVMatrixFromURLSumPYDML() throws MalformedURLException {
+		System.out.println("MLContextTest - IJV matrix from URL sum PYDML");
+		URL matrixUrl = new URL(
+				"https://raw.githubusercontent.com/apache/incubator-systemml/master/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv");
+		MatrixMetadata ijvMetadata = new MatrixMetadata(MatrixFormat.IJV, 2, 2);
+		Script sumScript = pydml("print('sum: ' + sum(M))").in("M", matrixUrl, ijvMetadata);
+		setExpectedStdOut("sum: 10.0");
+		ml.execute(sumScript);
+	}
+
// NOTE: Uncomment these tests once they work
// @SuppressWarnings({ "rawtypes", "unchecked" })
@@ -1714,13 +1756,13 @@ public class MLContextTest extends AutomatedTestBase {
@AfterClass
public static void tearDownClass() {
- //stop spark context to allow single jvm tests (otherwise the
- //next test that tries to create a SparkContext would fail)
+ // stop spark context to allow single jvm tests (otherwise the
+ // next test that tries to create a SparkContext would fail)
sc.stop();
sc = null;
conf = null;
-
- //clear status mlcontext and spark exec context
+
+ // clear status mlcontext and spark exec context
ml.close();
ml = null;
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv
----------------------------------------------------------------------
diff --git a/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv b/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv
new file mode 100644
index 0000000..ddf4c44
--- /dev/null
+++ b/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv
@@ -0,0 +1,4 @@
+1 1 1.0
+1 2 2.0
+2 1 3.0
+2 2 4.0
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv.mtd
----------------------------------------------------------------------
diff --git a/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv.mtd b/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv.mtd
new file mode 100644
index 0000000..d459138
--- /dev/null
+++ b/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv.mtd
@@ -0,0 +1,11 @@
+{
+ "data_type": "matrix",
+ "value_type": "double",
+ "rows": 2,
+ "cols": 2,
+ "nnz": 4,
+ "format": "text",
+ "description": {
+ "author": "SystemML"
+ }
+}
\ No newline at end of file