You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2019/02/25 10:38:25 UTC
[spark] branch master updated: [SPARK-26966][ML] Update to JPMML 1.4.8
This is an automated email from the ASF dual-hosted git repository.
srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new d252978 [SPARK-26966][ML] Update to JPMML 1.4.8
d252978 is described below
commit d2529788ed07980ab2ef2472138cb16eb32a15e8
Author: Sean Owen <se...@databricks.com>
AuthorDate: Mon Feb 25 04:37:45 2019 -0600
[SPARK-26966][ML] Update to JPMML 1.4.8
## What changes were proposed in this pull request?
JPMML apparently supports Java 9 only in 1.4.2+. We are seeing test failures from JPMML relating to JAXB when running on Java 11. It's shaded and not a big change, so it should be safe.
## How was this patch tested?
Existing tests.
Closes #23868 from srowen/SPARK-26966.
Authored-by: Sean Owen <se...@databricks.com>
Signed-off-by: Sean Owen <se...@databricks.com>
---
.../export/BinaryClassificationPMMLModelExport.scala | 14 ++++++++------
.../pmml/export/GeneralizedLinearPMMLModelExport.scala | 10 ++++++----
.../spark/mllib/pmml/export/KMeansPMMLModelExport.scala | 10 ++++++----
.../spark/mllib/pmml/export/PMMLModelExportFactory.scala | 6 +++---
.../org/apache/spark/ml/clustering/KMeansSuite.scala | 3 ++-
.../spark/ml/regression/LinearRegressionSuite.scala | 4 ++--
.../test/scala/org/apache/spark/ml/util/PMMLUtils.scala | 16 ++++++++--------
.../BinaryClassificationPMMLModelExportSuite.scala | 9 +++++----
.../export/GeneralizedLinearPMMLModelExportSuite.scala | 2 +-
.../mllib/pmml/export/KMeansPMMLModelExportSuite.scala | 2 +-
pom.xml | 2 +-
11 files changed, 43 insertions(+), 35 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala
index a8c32f7..27935c6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala
@@ -19,7 +19,9 @@ package org.apache.spark.mllib.pmml.export
import scala.{Array => SArray}
-import org.dmg.pmml._
+import org.dmg.pmml.{DataDictionary, DataField, DataType, FieldName, MiningField,
+ MiningFunction, MiningSchema, OpType}
+import org.dmg.pmml.regression.{NumericPredictor, RegressionModel, RegressionTable}
import org.apache.spark.mllib.regression.GeneralizedLinearModel
@@ -29,7 +31,7 @@ import org.apache.spark.mllib.regression.GeneralizedLinearModel
private[mllib] class BinaryClassificationPMMLModelExport(
model: GeneralizedLinearModel,
description: String,
- normalizationMethod: RegressionNormalizationMethodType,
+ normalizationMethod: RegressionModel.NormalizationMethod,
threshold: Double)
extends PMMLModelExport {
@@ -47,7 +49,7 @@ private[mllib] class BinaryClassificationPMMLModelExport(
val miningSchema = new MiningSchema
val regressionTableYES = new RegressionTable(model.intercept).setTargetCategory("1")
var interceptNO = threshold
- if (RegressionNormalizationMethodType.LOGIT == normalizationMethod) {
+ if (RegressionModel.NormalizationMethod.LOGIT == normalizationMethod) {
if (threshold <= 0) {
interceptNO = Double.MinValue
} else if (threshold >= 1) {
@@ -58,7 +60,7 @@ private[mllib] class BinaryClassificationPMMLModelExport(
}
val regressionTableNO = new RegressionTable(interceptNO).setTargetCategory("0")
val regressionModel = new RegressionModel()
- .setFunctionName(MiningFunctionType.CLASSIFICATION)
+ .setMiningFunction(MiningFunction.CLASSIFICATION)
.setMiningSchema(miningSchema)
.setModelName(description)
.setNormalizationMethod(normalizationMethod)
@@ -69,7 +71,7 @@ private[mllib] class BinaryClassificationPMMLModelExport(
dataDictionary.addDataFields(new DataField(fields(i), OpType.CONTINUOUS, DataType.DOUBLE))
miningSchema
.addMiningFields(new MiningField(fields(i))
- .setUsageType(FieldUsageType.ACTIVE))
+ .setUsageType(MiningField.UsageType.ACTIVE))
regressionTableYES.addNumericPredictors(new NumericPredictor(fields(i), model.weights(i)))
}
@@ -79,7 +81,7 @@ private[mllib] class BinaryClassificationPMMLModelExport(
.addDataFields(new DataField(targetField, OpType.CATEGORICAL, DataType.STRING))
miningSchema
.addMiningFields(new MiningField(targetField)
- .setUsageType(FieldUsageType.TARGET))
+ .setUsageType(MiningField.UsageType.TARGET))
dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala
index 4d951d2..723224d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala
@@ -19,7 +19,9 @@ package org.apache.spark.mllib.pmml.export
import scala.{Array => SArray}
-import org.dmg.pmml._
+import org.dmg.pmml.{DataDictionary, DataField, DataType, FieldName, MiningField,
+ MiningFunction, MiningSchema, OpType}
+import org.dmg.pmml.regression.{NumericPredictor, RegressionModel, RegressionTable}
import org.apache.spark.mllib.regression.GeneralizedLinearModel
@@ -45,7 +47,7 @@ private[mllib] class GeneralizedLinearPMMLModelExport(
val miningSchema = new MiningSchema
val regressionTable = new RegressionTable(model.intercept)
val regressionModel = new RegressionModel()
- .setFunctionName(MiningFunctionType.REGRESSION)
+ .setMiningFunction(MiningFunction.REGRESSION)
.setMiningSchema(miningSchema)
.setModelName(description)
.addRegressionTables(regressionTable)
@@ -55,7 +57,7 @@ private[mllib] class GeneralizedLinearPMMLModelExport(
dataDictionary.addDataFields(new DataField(fields(i), OpType.CONTINUOUS, DataType.DOUBLE))
miningSchema
.addMiningFields(new MiningField(fields(i))
- .setUsageType(FieldUsageType.ACTIVE))
+ .setUsageType(MiningField.UsageType.ACTIVE))
regressionTable.addNumericPredictors(new NumericPredictor(fields(i), model.weights(i)))
}
@@ -64,7 +66,7 @@ private[mllib] class GeneralizedLinearPMMLModelExport(
dataDictionary.addDataFields(new DataField(targetField, OpType.CONTINUOUS, DataType.DOUBLE))
miningSchema
.addMiningFields(new MiningField(targetField)
- .setUsageType(FieldUsageType.TARGET))
+ .setUsageType(MiningField.UsageType.TARGET))
dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala
index 255c614..d043c9e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala
@@ -19,7 +19,9 @@ package org.apache.spark.mllib.pmml.export
import scala.{Array => SArray}
-import org.dmg.pmml._
+import org.dmg.pmml.{Array, CompareFunction, ComparisonMeasure, DataDictionary, DataField, DataType,
+ FieldName, MiningField, MiningFunction, MiningSchema, OpType, SquaredEuclidean}
+import org.dmg.pmml.clustering.{Cluster, ClusteringField, ClusteringModel}
import org.apache.spark.mllib.clustering.KMeansModel
@@ -48,7 +50,7 @@ private[mllib] class KMeansPMMLModelExport(model: KMeansModel) extends PMMLModel
.setModelName("k-means")
.setMiningSchema(miningSchema)
.setComparisonMeasure(comparisonMeasure)
- .setFunctionName(MiningFunctionType.CLUSTERING)
+ .setMiningFunction(MiningFunction.CLUSTERING)
.setModelClass(ClusteringModel.ModelClass.CENTER_BASED)
.setNumberOfClusters(model.clusterCenters.length)
@@ -57,9 +59,9 @@ private[mllib] class KMeansPMMLModelExport(model: KMeansModel) extends PMMLModel
dataDictionary.addDataFields(new DataField(fields(i), OpType.CONTINUOUS, DataType.DOUBLE))
miningSchema
.addMiningFields(new MiningField(fields(i))
- .setUsageType(FieldUsageType.ACTIVE))
+ .setUsageType(MiningField.UsageType.ACTIVE))
clusteringModel.addClusteringFields(
- new ClusteringField(fields(i)).setCompareFunction(CompareFunctionType.ABS_DIFF))
+ new ClusteringField(fields(i)).setCompareFunction(CompareFunction.ABS_DIFF))
}
dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala
index 29bd689..84e6304 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala
@@ -17,7 +17,7 @@
package org.apache.spark.mllib.pmml.export
-import org.dmg.pmml.RegressionNormalizationMethodType
+import org.dmg.pmml.regression.RegressionModel
import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
@@ -44,12 +44,12 @@ private[mllib] object PMMLModelExportFactory {
new GeneralizedLinearPMMLModelExport(lasso, "lasso regression")
case svm: SVMModel =>
new BinaryClassificationPMMLModelExport(
- svm, "linear SVM", RegressionNormalizationMethodType.NONE,
+ svm, "linear SVM", RegressionModel.NormalizationMethod.NONE,
svm.getThreshold.getOrElse(0.0))
case logistic: LogisticRegressionModel =>
if (logistic.numClasses == 2) {
new BinaryClassificationPMMLModelExport(
- logistic, "logistic regression", RegressionNormalizationMethodType.LOGIT,
+ logistic, "logistic regression", RegressionModel.NormalizationMethod.LOGIT,
logistic.getThreshold.getOrElse(0.5))
} else {
throw new IllegalArgumentException(
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
index b377582..a5159bc 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -20,7 +20,8 @@ package org.apache.spark.ml.clustering
import scala.language.existentials
import scala.util.Random
-import org.dmg.pmml.{ClusteringModel, PMML}
+import org.dmg.pmml.PMML
+import org.dmg.pmml.clustering.ClusteringModel
import org.apache.spark.SparkException
import org.apache.spark.ml.linalg.{Vector, Vectors}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 7653289..c4db336 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -18,10 +18,10 @@
package org.apache.spark.ml.regression
import scala.collection.JavaConverters._
-import scala.collection.mutable
import scala.util.Random
-import org.dmg.pmml.{OpType, PMML, RegressionModel => PMMLRegressionModel}
+import org.dmg.pmml.{OpType, PMML}
+import org.dmg.pmml.regression.{RegressionModel => PMMLRegressionModel}
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.feature.LabeledPoint
diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/PMMLUtils.scala b/mllib/src/test/scala/org/apache/spark/ml/util/PMMLUtils.scala
index dbdc69f..620c754 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/util/PMMLUtils.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/util/PMMLUtils.scala
@@ -16,13 +16,12 @@
*/
package org.apache.spark.ml.util
-import java.io.StringReader
-import javax.xml.bind.Unmarshaller
-import javax.xml.transform.Source
+import java.io.ByteArrayInputStream
+import java.nio.charset.StandardCharsets
-import org.dmg.pmml._
-import org.jpmml.model.{ImportFilter, JAXBUtil}
-import org.xml.sax.InputSource
+import org.dmg.pmml.PMML
+import org.jpmml.model.{JAXBUtil, SAXUtil}
+import org.jpmml.model.filters.ImportFilter
/**
* Testing utils for working with PMML.
@@ -36,8 +35,9 @@ private[spark] object PMMLUtils {
* through external spark-packages.
*/
def loadFromString(input: String): PMML = {
- val is = new StringReader(input)
- val transformed = ImportFilter.apply(new InputSource(is))
+ val transformed = SAXUtil.createFilteredSource(
+ new ByteArrayInputStream(input.getBytes(StandardCharsets.UTF_8)),
+ new ImportFilter())
JAXBUtil.unmarshalPMML(transformed)
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
index 4c6e76e..08c581c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala
@@ -17,8 +17,7 @@
package org.apache.spark.mllib.pmml.export
-import org.dmg.pmml.RegressionModel
-import org.dmg.pmml.RegressionNormalizationMethodType
+import org.dmg.pmml.regression.RegressionModel
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.LogisticRegressionModel
@@ -51,7 +50,8 @@ class BinaryClassificationPMMLModelExportSuite extends SparkFunSuite {
assert(pmmlRegressionModel.getRegressionTables.get(1).getTargetCategory === "0")
assert(pmmlRegressionModel.getRegressionTables.get(1).getNumericPredictors.size === 0)
// ensure logistic regression has normalization method set to LOGIT
- assert(pmmlRegressionModel.getNormalizationMethod() == RegressionNormalizationMethodType.LOGIT)
+ assert(pmmlRegressionModel.getNormalizationMethod() ===
+ RegressionModel.NormalizationMethod.LOGIT)
}
test("linear SVM PMML export") {
@@ -78,7 +78,8 @@ class BinaryClassificationPMMLModelExportSuite extends SparkFunSuite {
assert(pmmlRegressionModel.getRegressionTables.get(1).getTargetCategory === "0")
assert(pmmlRegressionModel.getRegressionTables.get(1).getNumericPredictors.size === 0)
// ensure linear SVM has normalization method set to NONE
- assert(pmmlRegressionModel.getNormalizationMethod() == RegressionNormalizationMethodType.NONE)
+ assert(pmmlRegressionModel.getNormalizationMethod() ===
+ RegressionModel.NormalizationMethod.NONE)
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala
index 1d32309..bf1a0fd 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala
@@ -17,7 +17,7 @@
package org.apache.spark.mllib.pmml.export
-import org.dmg.pmml.RegressionModel
+import org.dmg.pmml.regression.RegressionModel
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
index b3f9750..b61c622 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala
@@ -17,7 +17,7 @@
package org.apache.spark.mllib.pmml.export
-import org.dmg.pmml.ClusteringModel
+import org.dmg.pmml.clustering.ClusteringModel
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
diff --git a/pom.xml b/pom.xml
index 746541b..f06afba 100644
--- a/pom.xml
+++ b/pom.xml
@@ -398,7 +398,7 @@
<dependency>
<groupId>org.jpmml</groupId>
<artifactId>pmml-model</artifactId>
- <version>1.2.15</version>
+ <version>1.4.8</version>
<scope>provided</scope>
<exclusions>
<exclusion>
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org