You are viewing a plain-text version of this content; the canonical hyperlink to the original was not preserved in this rendering.
Posted to commits@systemml.apache.org by ni...@apache.org on 2015/12/09 21:22:02 UTC
incubator-systemml git commit: Adding ID support for DataFrame required for MLContext
Repository: incubator-systemml
Updated Branches:
refs/heads/master 301977fc3 -> abd19df94
Adding ID support for DataFrame required for MLContext
This change also works for older Spark 1.3.0.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/abd19df9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/abd19df9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/abd19df9
Branch: refs/heads/master
Commit: abd19df94ecdb54cd764cce7793340711f858811
Parents: 301977f
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Wed Dec 9 12:22:02 2015 -0800
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Wed Dec 9 12:22:02 2015 -0800
----------------------------------------------------------------------
.../spark/utils/RDDConverterUtilsExt.java | 29 ++++++++++++++++++--
1 file changed, 26 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/abd19df9/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
index 4b8fdde..e64fb89 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
@@ -24,6 +24,7 @@ import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+import java.util.Scanner;
import org.apache.hadoop.io.Text;
import org.apache.spark.Accumulator;
@@ -65,6 +66,7 @@ import org.apache.sysml.runtime.util.UtilFunctions;
* NOTE: These are experimental converter utils. Once thoroughly tested, they
* can be moved to RDDConverterUtils.
*/
+@SuppressWarnings("unused")
public class RDDConverterUtilsExt
{
public enum RDDConverterTypes {
@@ -180,9 +182,30 @@ public class RDDConverterUtilsExt
DataFrame df, MatrixCharacteristics mcOut, boolean containsID)
throws DMLRuntimeException {
if(containsID) {
- // Uncomment this when we move to Spark 1.4.0 or higher
- // df = df.sort("ID").drop("ID");
- throw new DMLRuntimeException("containsID is not supported yet");
+ ArrayList<String> columnToSelect = new ArrayList<String>();
+ String firstCol = null;
+ boolean colIDPresent = false;
+ for(String col : df.columns()) {
+ if(col.compareTo("ID") == 0) {
+ colIDPresent = true;
+ }
+ else if(firstCol == null) {
+ firstCol = col;
+ }
+ else {
+ columnToSelect.add(col);
+ }
+ }
+
+ if(!colIDPresent) {
+ throw new DMLRuntimeException("The column \"ID\" is not present in the dataframe.");
+ }
+ else if(firstCol == null) {
+ throw new DMLRuntimeException("No column other than \"ID\" present in the dataframe.");
+ }
+
+ // Round about way to do in Java (not exposed in Spark 1.3.0): df = df.sort("ID").drop("ID");
+ df = df.sort("ID").select(firstCol, scala.collection.JavaConversions.asScalaBuffer(columnToSelect).toList());
}
//determine unknown dimensions and sparsity if required