You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2019/02/25 10:38:25 UTC

[spark] branch master updated: [SPARK-26966][ML] Update to JPMML 1.4.8

This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new d252978  [SPARK-26966][ML] Update to JPMML 1.4.8
d252978 is described below

commit d2529788ed07980ab2ef2472138cb16eb32a15e8
Author: Sean Owen <se...@databricks.com>
AuthorDate: Mon Feb 25 04:37:45 2019 -0600

    [SPARK-26966][ML] Update to JPMML 1.4.8
    
    ## What changes were proposed in this pull request?
    
    JPMML apparently only supports Java 9 in 1.4.2+. We are seeing test failures from JPMML relating to JAXB when running on Java 11. It's shaded and not a big change, so should be safe.
    
    ## How was this patch tested?
    
    Existing tests.
    
    Closes #23868 from srowen/SPARK-26966.
    
    Authored-by: Sean Owen <se...@databricks.com>
    Signed-off-by: Sean Owen <se...@databricks.com>
---
 .../export/BinaryClassificationPMMLModelExport.scala     | 14 ++++++++------
 .../pmml/export/GeneralizedLinearPMMLModelExport.scala   | 10 ++++++----
 .../spark/mllib/pmml/export/KMeansPMMLModelExport.scala  | 10 ++++++----
 .../spark/mllib/pmml/export/PMMLModelExportFactory.scala |  6 +++---
 .../org/apache/spark/ml/clustering/KMeansSuite.scala     |  3 ++-
 .../spark/ml/regression/LinearRegressionSuite.scala      |  4 ++--
 .../test/scala/org/apache/spark/ml/util/PMMLUtils.scala  | 16 ++++++++--------
 .../BinaryClassificationPMMLModelExportSuite.scala       |  9 +++++----
 .../export/GeneralizedLinearPMMLModelExportSuite.scala   |  2 +-
 .../mllib/pmml/export/KMeansPMMLModelExportSuite.scala   |  2 +-
 pom.xml                                                  |  2 +-
 11 files changed, 43 insertions(+), 35 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala
index a8c32f7..27935c6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala
@@ -19,7 +19,9 @@ package org.apache.spark.mllib.pmml.export
 
 import scala.{Array => SArray}
 
-import org.dmg.pmml._
+import org.dmg.pmml.{DataDictionary, DataField, DataType, FieldName, MiningField,
+  MiningFunction, MiningSchema, OpType}
+import org.dmg.pmml.regression.{NumericPredictor, RegressionModel, RegressionTable}
 
 import org.apache.spark.mllib.regression.GeneralizedLinearModel
 
@@ -29,7 +31,7 @@ import org.apache.spark.mllib.regression.GeneralizedLinearModel
 private[mllib] class BinaryClassificationPMMLModelExport(
     model: GeneralizedLinearModel,
     description: String,
-    normalizationMethod: RegressionNormalizationMethodType,
+    normalizationMethod: RegressionModel.NormalizationMethod,
     threshold: Double)
   extends PMMLModelExport {
 
@@ -47,7 +49,7 @@ private[mllib] class BinaryClassificationPMMLModelExport(
        val miningSchema = new MiningSchema
        val regressionTableYES = new RegressionTable(model.intercept).setTargetCategory("1")
        var interceptNO = threshold
-       if (RegressionNormalizationMethodType.LOGIT == normalizationMethod) {
+       if (RegressionModel.NormalizationMethod.LOGIT == normalizationMethod) {
          if (threshold <= 0) {
            interceptNO = Double.MinValue
          } else if (threshold >= 1) {
@@ -58,7 +60,7 @@ private[mllib] class BinaryClassificationPMMLModelExport(
        }
        val regressionTableNO = new RegressionTable(interceptNO).setTargetCategory("0")
        val regressionModel = new RegressionModel()
-         .setFunctionName(MiningFunctionType.CLASSIFICATION)
+         .setMiningFunction(MiningFunction.CLASSIFICATION)
          .setMiningSchema(miningSchema)
          .setModelName(description)
          .setNormalizationMethod(normalizationMethod)
@@ -69,7 +71,7 @@ private[mllib] class BinaryClassificationPMMLModelExport(
          dataDictionary.addDataFields(new DataField(fields(i), OpType.CONTINUOUS, DataType.DOUBLE))
          miningSchema
            .addMiningFields(new MiningField(fields(i))
-           .setUsageType(FieldUsageType.ACTIVE))
+           .setUsageType(MiningField.UsageType.ACTIVE))
          regressionTableYES.addNumericPredictors(new NumericPredictor(fields(i), model.weights(i)))
        }
 
@@ -79,7 +81,7 @@ private[mllib] class BinaryClassificationPMMLModelExport(
          .addDataFields(new DataField(targetField, OpType.CATEGORICAL, DataType.STRING))
        miningSchema
          .addMiningFields(new MiningField(targetField)
-         .setUsageType(FieldUsageType.TARGET))
+         .setUsageType(MiningField.UsageType.TARGET))
 
        dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size)
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala
index 4d951d2..723224d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala
@@ -19,7 +19,9 @@ package org.apache.spark.mllib.pmml.export
 
 import scala.{Array => SArray}
 
-import org.dmg.pmml._
+import org.dmg.pmml.{DataDictionary, DataField, DataType, FieldName, MiningField,
+  MiningFunction, MiningSchema, OpType}
+import org.dmg.pmml.regression.{NumericPredictor, RegressionModel, RegressionTable}
 
 import org.apache.spark.mllib.regression.GeneralizedLinearModel
 
@@ -45,7 +47,7 @@ private[mllib] class GeneralizedLinearPMMLModelExport(
       val miningSchema = new MiningSchema
       val regressionTable = new RegressionTable(model.intercept)
       val regressionModel = new RegressionModel()
-        .setFunctionName(MiningFunctionType.REGRESSION)
+        .setMiningFunction(MiningFunction.REGRESSION)
         .setMiningSchema(miningSchema)
         .setModelName(description)
         .addRegressionTables(regressionTable)
@@ -55,7 +57,7 @@ private[mllib] class GeneralizedLinearPMMLModelExport(
         dataDictionary.addDataFields(new DataField(fields(i), OpType.CONTINUOUS, DataType.DOUBLE))
         miningSchema
           .addMiningFields(new MiningField(fields(i))
-          .setUsageType(FieldUsageType.ACTIVE))
+          .setUsageType(MiningField.UsageType.ACTIVE))
         regressionTable.addNumericPredictors(new NumericPredictor(fields(i), model.weights(i)))
       }
 
@@ -64,7 +66,7 @@ private[mllib] class GeneralizedLinearPMMLModelExport(
       dataDictionary.addDataFields(new DataField(targetField, OpType.CONTINUOUS, DataType.DOUBLE))
       miningSchema
         .addMiningFields(new MiningField(targetField)
-        .setUsageType(FieldUsageType.TARGET))
+        .setUsageType(MiningField.UsageType.TARGET))
 
       dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size)
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala
index 255c614..d043c9e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala
@@ -19,7 +19,9 @@ package org.apache.spark.mllib.pmml.export
 
 import scala.{Array => SArray}
 
-import org.dmg.pmml._
+import org.dmg.pmml.{Array, CompareFunction, ComparisonMeasure, DataDictionary, DataField, DataType,
+  FieldName, MiningField, MiningFunction, MiningSchema, OpType, SquaredEuclidean}
+import org.dmg.pmml.clustering.{Cluster, ClusteringField, ClusteringModel}
 
 import org.apache.spark.mllib.clustering.KMeansModel
 
@@ -48,7 +50,7 @@ private[mllib] class KMeansPMMLModelExport(model: KMeansModel) extends PMMLModel
         .setModelName("k-means")
         .setMiningSchema(miningSchema)
         .setComparisonMeasure(comparisonMeasure)
-        .setFunctionName(MiningFunctionType.CLUSTERING)
+        .setMiningFunction(MiningFunction.CLUSTERING)
         .setModelClass(ClusteringModel.ModelClass.CENTER_BASED)
         .setNumberOfClusters(model.clusterCenters.length)
 
@@ -57,9 +59,9 @@ private[mllib] class KMeansPMMLModelExport(model: KMeansModel) extends PMMLModel
         dataDictionary.addDataFields(new DataField(fields(i), OpType.CONTINUOUS, DataType.DOUBLE))
         miningSchema
           .addMiningFields(new MiningField(fields(i))
-          .setUsageType(FieldUsageType.ACTIVE))
+          .setUsageType(MiningField.UsageType.ACTIVE))
         clusteringModel.addClusteringFields(
-          new ClusteringField(fields(i)).setCompareFunction(CompareFunctionType.ABS_DIFF))
+          new ClusteringField(fields(i)).setCompareFunction(CompareFunction.ABS_DIFF))
       }
 
       dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala
index 29bd689..84e6304 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.mllib.pmml.export
 
-import org.dmg.pmml.RegressionNormalizationMethodType
+import org.dmg.pmml.regression.RegressionModel
 
 import org.apache.spark.mllib.classification.LogisticRegressionModel
 import org.apache.spark.mllib.classification.SVMModel
@@ -44,12 +44,12 @@ private[mllib] object PMMLModelExportFactory {
         new GeneralizedLinearPMMLModelExport(lasso, "lasso regression")
       case svm: SVMModel =>
         new BinaryClassificationPMMLModelExport(
-          svm, "linear SVM", RegressionNormalizationMethodType.NONE,
+          svm, "linear SVM", RegressionModel.NormalizationMethod.NONE,
           svm.getThreshold.getOrElse(0.0))
       case logistic: LogisticRegressionModel =>
         if (logistic.numClasses == 2) {
           new BinaryClassificationPMMLModelExport(
-            logistic, "logistic regression", RegressionNormalizationMethodType.LOGIT,
+            logistic, "logistic regression", RegressionModel.NormalizationMethod.LOGIT,
             logistic.getThreshold.getOrElse(0.5))
         } else {
           throw new IllegalArgumentException(
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
index b377582..a5159bc 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -20,7 +20,8 @@ package org.apache.spark.ml.clustering
 import scala.language.existentials
 import scala.util.Random
 
-import org.dmg.pmml.{ClusteringModel, PMML}
+import org.dmg.pmml.PMML
+import org.dmg.pmml.clustering.ClusteringModel
 
 import org.apache.spark.SparkException
 import org.apache.spark.ml.linalg.{Vector, Vectors}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 7653289..c4db336 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -18,10 +18,10 @@
 package org.apache.spark.ml.regression
 
 import scala.collection.JavaConverters._
-import scala.collection.mutable
 import scala.util.Random
 
-import org.dmg.pmml.{OpType, PMML, RegressionModel => PMMLRegressionModel}
+import org.dmg.pmml.{OpType, PMML}
+import org.dmg.pmml.regression.{RegressionModel => PMMLRegressionModel}
 
 import org.apache.spark.ml.feature.Instance
 import org.apache.spark.ml.feature.LabeledPoint
diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/PMMLUtils.scala b/mllib/src/test/scala/org/apache/spark/ml/util/PMMLUtils.scala
index dbdc69f..620c754 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/util/PMMLUtils.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/util/PMMLUtils.scala
@@ -16,13 +16,12 @@
  */
 package org.apache.spark.ml.util
 
-import java.io.StringReader
-import javax.xml.bind.Unmarshaller
-import javax.xml.transform.Source
+import java.io.ByteArrayInputStream
+import java.nio.charset.StandardCharsets
 
-import org.dmg.pmml._
-import org.jpmml.model.{ImportFilter, JAXBUtil}
-import org.xml.sax.InputSource
+import org.dmg.pmml.PMML
+import org.jpmml.model.{JAXBUtil, SAXUtil}
+import org.jpmml.model.filters.ImportFilter
 
 /**
  * Testing utils for working with PMML.
@@ -36,8 +35,9 @@ private[spark] object PMMLUtils {
    * through external spark-packages.
    */
   def loadFromString(input: String): PMML = {
-    val is = new StringReader(input)
-    val transformed = ImportFilter.apply(new InputSource(is))
+    val transformed = SAXUtil.createFilteredSource(
+      new ByteArrayInputStream(input.getBytes(StandardCharsets.UTF_8)),
+      new ImportFilter())
     JAXBUtil.unmarshalPMML(transformed)
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
index 4c6e76e..08c581c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
@@ -17,8 +17,7 @@
 
 package org.apache.spark.mllib.pmml.export
 
-import org.dmg.pmml.RegressionModel
-import org.dmg.pmml.RegressionNormalizationMethodType
+import org.dmg.pmml.regression.RegressionModel
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.classification.LogisticRegressionModel
@@ -51,7 +50,8 @@ class BinaryClassificationPMMLModelExportSuite extends SparkFunSuite {
     assert(pmmlRegressionModel.getRegressionTables.get(1).getTargetCategory === "0")
     assert(pmmlRegressionModel.getRegressionTables.get(1).getNumericPredictors.size === 0)
     // ensure logistic regression has normalization method set to LOGIT
-    assert(pmmlRegressionModel.getNormalizationMethod() == RegressionNormalizationMethodType.LOGIT)
+    assert(pmmlRegressionModel.getNormalizationMethod() ===
+      RegressionModel.NormalizationMethod.LOGIT)
   }
 
   test("linear SVM PMML export") {
@@ -78,7 +78,8 @@ class BinaryClassificationPMMLModelExportSuite extends SparkFunSuite {
     assert(pmmlRegressionModel.getRegressionTables.get(1).getTargetCategory === "0")
     assert(pmmlRegressionModel.getRegressionTables.get(1).getNumericPredictors.size === 0)
     // ensure linear SVM has normalization method set to NONE
-    assert(pmmlRegressionModel.getNormalizationMethod() == RegressionNormalizationMethodType.NONE)
+    assert(pmmlRegressionModel.getNormalizationMethod() ===
+      RegressionModel.NormalizationMethod.NONE)
   }
 
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala
index 1d32309..bf1a0fd 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.mllib.pmml.export
 
-import org.dmg.pmml.RegressionModel
+import org.dmg.pmml.regression.RegressionModel
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
index b3f9750..b61c622 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.mllib.pmml.export
 
-import org.dmg.pmml.ClusteringModel
+import org.dmg.pmml.clustering.ClusteringModel
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.clustering.KMeansModel
diff --git a/pom.xml b/pom.xml
index 746541b..f06afba 100644
--- a/pom.xml
+++ b/pom.xml
@@ -398,7 +398,7 @@
       <dependency>
         <groupId>org.jpmml</groupId>
         <artifactId>pmml-model</artifactId>
-        <version>1.2.15</version>
+        <version>1.4.8</version>
         <scope>provided</scope>
         <exclusions>
           <exclusion>


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org