Posted to commits@ignite.apache.org by ch...@apache.org on 2019/01/23 11:41:59 UTC

[ignite] branch master updated: IGNITE-11001: [ML] Add parser for Spark Linear SVM model

This is an automated email from the ASF dual-hosted git repository.

chief pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ignite.git


The following commit(s) were added to refs/heads/master by this push:
     new d727fd4  IGNITE-11001: [ML] Add parser for Spark Linear SVM model
d727fd4 is described below

commit d727fd4e687b13a62d2d9220561cee6b88f6147f
Author: zaleslaw <za...@gmail.com>
AuthorDate: Wed Jan 23 14:41:39 2019 +0300

    IGNITE-11001: [ML] Add parser for Spark Linear SVM model
    
    This closes #5893
---
 .../LinearRegressionFromSparkExample.java          |   6 +-
 .../spark/modelparser/LogRegFromSparkExample.java  |   4 +-
 ...mSparkExample.java => SVMFromSparkExample.java} |  24 ++++----
 .../models/spark/serialized/svm/data/._SUCCESS.crc | Bin 0 -> 8 bytes
 ...-4948-8e65-29c9f5c9c5b2-c000.snappy.parquet.crc | Bin 0 -> 24 bytes
 .../models/spark/serialized/svm/data/_SUCCESS      |   0
 ...a36c-4948-8e65-29c9f5c9c5b2-c000.snappy.parquet | Bin 0 -> 1562 bytes
 .../spark/serialized/svm/metadata/._SUCCESS.crc    | Bin 0 -> 8 bytes
 .../spark/serialized/svm/metadata/.part-00000.crc  | Bin 0 -> 12 bytes
 .../models/spark/serialized/svm/metadata/_SUCCESS  |   0
 .../spark/serialized/svm/metadata/part-00000       |   1 +
 .../ml/sparkmodelparser/SparkModelParser.java      |  65 +++++++++++++++++++++
 .../ml/sparkmodelparser/SupportedSparkModels.java  |   5 +-
 13 files changed, 87 insertions(+), 18 deletions(-)
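
For context, the serialized model added under examples/src/main/resources/models/spark/serialized/svm
is the kind of artifact Spark's LinearSVC produces via model.write().overwrite().save(..). Below is a
minimal Spark-side sketch with parameter values taken from the sample model's metadata; the class name,
method name and dataset variable are illustrative and not part of this commit.

    import java.io.IOException;

    import org.apache.spark.ml.classification.LinearSVC;
    import org.apache.spark.ml.classification.LinearSVCModel;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    public class TrainLinearSvmOnSpark {
        /** Fits a LinearSVC on an already vectorized dataset and saves it in the layout SparkModelParser reads. */
        public static void trainAndSave(Dataset<Row> passengers, String outDir) throws IOException {
            LinearSVC trainer = new LinearSVC()
                .setLabelCol("survived")     // values below mirror paramMap in the sample metadata
                .setFeaturesCol("features")
                .setMaxIter(100)
                .setRegParam(0.3)
                .setThreshold(0.6);

            LinearSVCModel mdl = trainer.fit(passengers);

            // Produces <outDir>/data/part-*.snappy.parquet (consumed by the new parser)
            // and <outDir>/metadata/part-00000 (the JSON shown further down in this diff).
            mdl.write().overwrite().save(outDir);
        }
    }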

diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/LinearRegressionFromSparkExample.java b/examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/LinearRegressionFromSparkExample.java
index e37bf8b..41aaa88 100644
--- a/examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/LinearRegressionFromSparkExample.java
+++ b/examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/LinearRegressionFromSparkExample.java
@@ -39,8 +39,8 @@ import org.apache.ignite.ml.sparkmodelparser.SupportedSparkModels;
  * You can change the test data used in this example and re-run it to explore this algorithm further.</p>
  */
 public class LinearRegressionFromSparkExample {
-    /** Path to Spark LogReg model. */
-    public static final String SPARK_LINEAR_REG_MDL_PATH = "examples/src/main/resources/models/spark/serialized/linreg/data" +
+    /** Path to Spark linear regression model. */
+    public static final String SPARK_MDL_PATH = "examples/src/main/resources/models/spark/serialized/linreg/data" +
         "/part-00000-1ff2d09d-6cdf-4ad3-bddd-7cad8378429d-c000.snappy.parquet";
 
     /** Run example. */
@@ -65,7 +65,7 @@ public class LinearRegressionFromSparkExample {
             IgniteBiFunction<Integer, Object[], Double> lbExtractor = (k, v) -> (double)v[4];
 
             LinearRegressionModel mdl = (LinearRegressionModel)SparkModelParser.parse(
-                SPARK_LINEAR_REG_MDL_PATH,
+                SPARK_MDL_PATH,
                 SupportedSparkModels.LINEAR_REGRESSION
             );
 
diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/LogRegFromSparkExample.java b/examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/LogRegFromSparkExample.java
index 7581b40..bd2f801 100644
--- a/examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/LogRegFromSparkExample.java
+++ b/examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/LogRegFromSparkExample.java
@@ -39,7 +39,7 @@ import org.apache.ignite.ml.sparkmodelparser.SupportedSparkModels;
  */
 public class LogRegFromSparkExample {
     /** Path to Spark LogReg model. */
-    public static final String SPARK_LOG_REG_MDL_PATH = "examples/src/main/resources/models/spark/serialized/logreg/data" +
+    public static final String SPARK_MDL_PATH = "examples/src/main/resources/models/spark/serialized/logreg/data" +
         "/part-00000-7551081d-c0a8-4ed7-afe4-a464aabc7f80-c000.snappy.parquet";
 
     /** Run example. */
@@ -64,7 +64,7 @@ public class LogRegFromSparkExample {
             IgniteBiFunction<Integer, Object[], Double> lbExtractor = (k, v) -> (double)v[1];
 
             LogisticRegressionModel mdl = (LogisticRegressionModel) SparkModelParser.parse(
-                SPARK_LOG_REG_MDL_PATH,
+                SPARK_MDL_PATH,
                 SupportedSparkModels.LOG_REGRESSION
                 );
 
diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/LogRegFromSparkExample.java b/examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/SVMFromSparkExample.java
similarity index 78%
copy from examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/LogRegFromSparkExample.java
copy to examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/SVMFromSparkExample.java
index 7581b40..86aacf1 100644
--- a/examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/LogRegFromSparkExample.java
+++ b/examples/src/main/java/org/apache/ignite/examples/ml/inference/spark/modelparser/SVMFromSparkExample.java
@@ -25,27 +25,27 @@ import org.apache.ignite.examples.ml.tutorial.TitanicUtils;
 import org.apache.ignite.ml.math.functions.IgniteBiFunction;
 import org.apache.ignite.ml.math.primitives.vector.Vector;
 import org.apache.ignite.ml.math.primitives.vector.VectorUtils;
-import org.apache.ignite.ml.regressions.logistic.LogisticRegressionModel;
 import org.apache.ignite.ml.selection.scoring.evaluator.BinaryClassificationEvaluator;
 import org.apache.ignite.ml.selection.scoring.metric.Accuracy;
 import org.apache.ignite.ml.sparkmodelparser.SparkModelParser;
 import org.apache.ignite.ml.sparkmodelparser.SupportedSparkModels;
+import org.apache.ignite.ml.svm.SVMLinearClassificationModel;
 
 /**
- * Run logistic regression model loaded from snappy.parquet file.
+ * Run SVM model loaded from snappy.parquet file.
  * The snappy.parquet file was generated by Spark MLLib model.write.overwrite().save(..) operator.
  * <p>
  * You can change the test data used in this example and re-run it to explore this algorithm further.</p>
  */
-public class LogRegFromSparkExample {
-    /** Path to Spark LogReg model. */
-    public static final String SPARK_LOG_REG_MDL_PATH = "examples/src/main/resources/models/spark/serialized/logreg/data" +
-        "/part-00000-7551081d-c0a8-4ed7-afe4-a464aabc7f80-c000.snappy.parquet";
+public class SVMFromSparkExample {
+    /** Path to Spark SVM model. */
+    public static final String SPARK_MDL_PATH = "examples/src/main/resources/models/spark/serialized/svm/data" +
+        "/part-00000-b3d800e2-a36c-4948-8e65-29c9f5c9c5b2-c000.snappy.parquet";
 
     /** Run example. */
     public static void main(String[] args) throws FileNotFoundException {
         System.out.println();
-        System.out.println(">>> Logistic regression model loaded from Spark through serialization over partitioned dataset usage example started.");
+        System.out.println(">>> SVM model loaded from Spark through serialization over partitioned dataset usage example started.");
         // Start ignite grid.
         try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) {
             System.out.println(">>> Ignite grid started.");
@@ -63,12 +63,12 @@ public class LogRegFromSparkExample {
 
             IgniteBiFunction<Integer, Object[], Double> lbExtractor = (k, v) -> (double)v[1];
 
-            LogisticRegressionModel mdl = (LogisticRegressionModel) SparkModelParser.parse(
-                SPARK_LOG_REG_MDL_PATH,
-                SupportedSparkModels.LOG_REGRESSION
-                );
+            SVMLinearClassificationModel mdl = (SVMLinearClassificationModel)SparkModelParser.parse(
+                SPARK_MDL_PATH,
+                SupportedSparkModels.LINEAR_SVM
+            );
 
-            System.out.println(">>> Logistic regression model: " + mdl);
+            System.out.println(">>> SVM: " + mdl);
 
             double accuracy = BinaryClassificationEvaluator.evaluate(
                 dataCache,
diff --git a/examples/src/main/resources/models/spark/serialized/svm/data/._SUCCESS.crc b/examples/src/main/resources/models/spark/serialized/svm/data/._SUCCESS.crc
new file mode 100644
index 0000000..3b7b044
Binary files /dev/null and b/examples/src/main/resources/models/spark/serialized/svm/data/._SUCCESS.crc differ
diff --git a/examples/src/main/resources/models/spark/serialized/svm/data/.part-00000-b3d800e2-a36c-4948-8e65-29c9f5c9c5b2-c000.snappy.parquet.crc b/examples/src/main/resources/models/spark/serialized/svm/data/.part-00000-b3d800e2-a36c-4948-8e65-29c9f5c9c5b2-c000.snappy.parquet.crc
new file mode 100644
index 0000000..12c3d06
Binary files /dev/null and b/examples/src/main/resources/models/spark/serialized/svm/data/.part-00000-b3d800e2-a36c-4948-8e65-29c9f5c9c5b2-c000.snappy.parquet.crc differ
diff --git a/examples/src/main/resources/models/spark/serialized/svm/data/_SUCCESS b/examples/src/main/resources/models/spark/serialized/svm/data/_SUCCESS
new file mode 100644
index 0000000..e69de29
diff --git a/examples/src/main/resources/models/spark/serialized/svm/data/part-00000-b3d800e2-a36c-4948-8e65-29c9f5c9c5b2-c000.snappy.parquet b/examples/src/main/resources/models/spark/serialized/svm/data/part-00000-b3d800e2-a36c-4948-8e65-29c9f5c9c5b2-c000.snappy.parquet
new file mode 100644
index 0000000..e7272ee
Binary files /dev/null and b/examples/src/main/resources/models/spark/serialized/svm/data/part-00000-b3d800e2-a36c-4948-8e65-29c9f5c9c5b2-c000.snappy.parquet differ
diff --git a/examples/src/main/resources/models/spark/serialized/svm/metadata/._SUCCESS.crc b/examples/src/main/resources/models/spark/serialized/svm/metadata/._SUCCESS.crc
new file mode 100644
index 0000000..3b7b044
Binary files /dev/null and b/examples/src/main/resources/models/spark/serialized/svm/metadata/._SUCCESS.crc differ
diff --git a/examples/src/main/resources/models/spark/serialized/svm/metadata/.part-00000.crc b/examples/src/main/resources/models/spark/serialized/svm/metadata/.part-00000.crc
new file mode 100644
index 0000000..b9c5548
Binary files /dev/null and b/examples/src/main/resources/models/spark/serialized/svm/metadata/.part-00000.crc differ
diff --git a/examples/src/main/resources/models/spark/serialized/svm/metadata/_SUCCESS b/examples/src/main/resources/models/spark/serialized/svm/metadata/_SUCCESS
new file mode 100644
index 0000000..e69de29
diff --git a/examples/src/main/resources/models/spark/serialized/svm/metadata/part-00000 b/examples/src/main/resources/models/spark/serialized/svm/metadata/part-00000
new file mode 100644
index 0000000..f52c84e
--- /dev/null
+++ b/examples/src/main/resources/models/spark/serialized/svm/metadata/part-00000
@@ -0,0 +1 @@
+{"class":"org.apache.spark.ml.classification.LinearSVCModel","timestamp":1548080818725,"sparkVersion":"2.2.0","uid":"linearsvc_04b1238e9c9d","paramMap":{"fitIntercept":true,"standardization":true,"labelCol":"survived","tol":1.0E-6,"regParam":0.3,"featuresCol":"features","maxIter":100,"predictionCol":"prediction","threshold":0.6,"rawPredictionCol":"rawPrediction","aggregationDepth":2}}
diff --git a/modules/ml/spark-model-parser/src/main/java/org/apache/ignite/ml/sparkmodelparser/SparkModelParser.java b/modules/ml/spark-model-parser/src/main/java/org/apache/ignite/ml/sparkmodelparser/SparkModelParser.java
index 04ee902..9a869e8 100644
--- a/modules/ml/spark-model-parser/src/main/java/org/apache/ignite/ml/sparkmodelparser/SparkModelParser.java
+++ b/modules/ml/spark-model-parser/src/main/java/org/apache/ignite/ml/sparkmodelparser/SparkModelParser.java
@@ -27,6 +27,7 @@ import org.apache.ignite.ml.math.primitives.vector.Vector;
 import org.apache.ignite.ml.math.primitives.vector.impl.DenseVector;
 import org.apache.ignite.ml.regressions.linear.LinearRegressionModel;
 import org.apache.ignite.ml.regressions.logistic.LogisticRegressionModel;
+import org.apache.ignite.ml.svm.SVMLinearClassificationModel;
 import org.apache.parquet.column.page.PageReadStore;
 import org.apache.parquet.example.data.Group;
 import org.apache.parquet.example.data.simple.SimpleGroup;
@@ -59,12 +60,46 @@ public class SparkModelParser {
                 return loadLogRegModel(ignitePathToMdl);
             case LINEAR_REGRESSION:
                 return loadLinRegModel(ignitePathToMdl);
+            case LINEAR_SVM:
+                return loadLinearSVMModel(ignitePathToMdl);
             default:
                 throw new UnsupportedSparkModelException(ignitePathToMdl);
         }
     }
 
     /**
+     * Load SVM model.
+     *
+     * @param pathToMdl Path to model.
+     */
+    private static Model loadLinearSVMModel(String pathToMdl) {
+        Vector coefficients = null;
+        double interceptor = 0;
+
+        try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
+            PageReadStore pages;
+            final MessageType schema = r.getFooter().getFileMetaData().getSchema();
+            final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
+
+            while (null != (pages = r.readNextRowGroup())) {
+                final long rows = pages.getRowCount();
+                final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
+                for (int i = 0; i < rows; i++) {
+                    final SimpleGroup g = (SimpleGroup)recordReader.read();
+                    interceptor = readSVMInterceptor(g);
+                    coefficients = readSVMCoefficients(g);
+                }
+            }
+        }
+        catch (IOException e) {
+            System.out.println("Error reading parquet file.");
+            e.printStackTrace();
+        }
+
+        return new SVMLinearClassificationModel(coefficients, interceptor);
+    }
+
+    /**
      * Load linear regression model.
      *
      * @param pathToMdl Path to model.
@@ -136,6 +171,36 @@ public class SparkModelParser {
      *
      * @param g Interceptor group.
      */
+    private static double readSVMInterceptor(SimpleGroup g) {
+        return g.getDouble(1, 0);
+    }
+
+    /**
+     * Read coefficient matrix from parquet.
+     *
+     * @param g Coefficient group.
+     * @return Vector of coefficients.
+     */
+    private static Vector readSVMCoefficients(SimpleGroup g) {
+        Vector coefficients;
+        Group coeffGroup = g.getGroup(0, 0).getGroup(3, 0);
+
+        final int amountOfCoefficients = coeffGroup.getFieldRepetitionCount(0);
+
+        coefficients = new DenseVector(amountOfCoefficients);
+
+        for (int j = 0; j < amountOfCoefficients; j++) {
+            double coefficient = coeffGroup.getGroup(0, j).getDouble(0, 0);
+            coefficients.set(j, coefficient);
+        }
+        return coefficients;
+    }
+
+    /**
+     * Read interceptor value from parquet.
+     *
+     * @param g Interceptor group.
+     */
     private static double readLinRegInterceptor(SimpleGroup g) {
         return g.getDouble(0, 0);
     }
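
The hard-coded positions in readSVMInterceptor (getDouble(1, 0)) and readSVMCoefficients
(getGroup(0, 0).getGroup(3, 0)) follow the parquet layout Spark uses for LinearSVCModel data: a row of
(coefficients, intercept), where coefficients is expected to be the ML vector group (type, size, indices,
values). The same Parquet API the parser uses can print the schema of a given data file to verify those
positions; a minimal sketch (the class name is illustrative, not part of this commit):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.parquet.hadoop.ParquetFileReader;
    import org.apache.parquet.hadoop.util.HadoopInputFile;
    import org.apache.parquet.schema.MessageType;

    public class PrintSparkModelSchema {
        /** Prints the parquet schema of a saved Spark model's data file, e.g. to check field positions. */
        public static void main(String[] args) throws Exception {
            String path = args[0]; // e.g. examples/src/main/resources/models/spark/serialized/svm/data/part-...
            try (ParquetFileReader r = ParquetFileReader.open(
                HadoopInputFile.fromPath(new Path(path), new Configuration()))) {
                MessageType schema = r.getFooter().getFileMetaData().getSchema();
                System.out.println(schema);
            }
        }
    }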
diff --git a/modules/ml/spark-model-parser/src/main/java/org/apache/ignite/ml/sparkmodelparser/SupportedSparkModels.java b/modules/ml/spark-model-parser/src/main/java/org/apache/ignite/ml/sparkmodelparser/SupportedSparkModels.java
index 425d1b9..c47ca01 100644
--- a/modules/ml/spark-model-parser/src/main/java/org/apache/ignite/ml/sparkmodelparser/SupportedSparkModels.java
+++ b/modules/ml/spark-model-parser/src/main/java/org/apache/ignite/ml/sparkmodelparser/SupportedSparkModels.java
@@ -27,5 +27,8 @@ public enum SupportedSparkModels {
     LOG_REGRESSION,
 
     /** Linear regression. */
-    LINEAR_REGRESSION
+    LINEAR_REGRESSION,
+
+    /** Support Vector Machine. */
+    LINEAR_SVM
 }