You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2020/03/07 17:45:08 UTC
[spark] branch branch-3.0 updated: [SPARK-31012][ML][PYSPARK][DOCS]
Updating ML API docs for 3.0 changes
This is an automated email from the ASF dual-hosted git repository.
srowen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new d73ea97 [SPARK-31012][ML][PYSPARK][DOCS] Updating ML API docs for 3.0 changes
d73ea97 is described below
commit d73ea9791bff2c383b10d731cf651b2ebe159dde
Author: Huaxin Gao <hu...@us.ibm.com>
AuthorDate: Sat Mar 7 11:42:05 2020 -0600
[SPARK-31012][ML][PYSPARK][DOCS] Updating ML API docs for 3.0 changes
### What changes were proposed in this pull request?
Updating ML docs for 3.0 changes
### Why are the changes needed?
While auditing the 3.0 ML changes, I found that some docs are missing or not updated. These need to be updated.
### Does this PR introduce any user-facing change?
Yes, doc changes
### How was this patch tested?
Manually built the docs and checked the output.
Closes #27762 from huaxingao/spark-doc.
Authored-by: Huaxin Gao <hu...@us.ibm.com>
Signed-off-by: Sean Owen <sr...@gmail.com>
(cherry picked from commit 4a64901ab7176ad3d41c4598dea1e73cd92e969e)
Signed-off-by: Sean Owen <sr...@gmail.com>
---
docs/ml-features.md | 2 +-
.../evaluation/BinaryClassificationEvaluator.scala | 3 ++-
.../MulticlassClassificationEvaluator.scala | 21 +++++++++++++++++++++
.../spark/ml/evaluation/RankingEvaluator.scala | 5 +++++
.../spark/ml/evaluation/RegressionEvaluator.scala | 3 ++-
python/pyspark/ml/evaluation.py | 9 +++++----
python/pyspark/ml/feature.py | 2 +-
7 files changed, 37 insertions(+), 8 deletions(-)
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 9c05fd5..05ef848 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1499,7 +1499,7 @@ for more details on the API.
The `Imputer` estimator completes missing values in a dataset, either using the mean or the
median of the columns in which the missing values are located. The input columns should be of
-`DoubleType` or `FloatType`. Currently `Imputer` does not support categorical features and possibly
+numeric type. Currently `Imputer` does not support categorical features and possibly
creates incorrect values for columns containing categorical features. Imputer can impute custom values
other than 'NaN' by `.setMissingValue(custom_value)`. For example, `.setMissingValue(0)` will impute
all occurrences of (0).
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
index 55b910e..7733225 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -28,7 +28,8 @@ import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType
/**
- * Evaluator for binary classification, which expects two input columns: rawPrediction and label.
+ * Evaluator for binary classification, which expects input columns rawPrediction, label and
+ * an optional weight column.
* The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label 1)
* or of type vector (length-2 vector of raw predictions, scores, or label probabilities).
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
index 4357081..1d6540e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
@@ -81,6 +81,14 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
@Since("3.0.0")
def setProbabilityCol(value: String): this.type = set(probabilityCol, value)
+ /**
+ * The class whose metric will be computed in `"truePositiveRateByLabel"`,
+ * `"falsePositiveRateByLabel"`, `"precisionByLabel"`, `"recallByLabel"`,
+ * `"fMeasureByLabel"`.
+ * Must be greater than or equal to 0. The default value is 0.
+ *
+ * @group param
+ */
@Since("3.0.0")
final val metricLabel: DoubleParam = new DoubleParam(this, "metricLabel",
"The class whose metric will be computed in " +
@@ -98,6 +106,13 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
setDefault(metricLabel -> 0.0)
+ /**
+ * The beta value, which controls precision vs recall weighting,
+ * used in `"weightedFMeasure"`, `"fMeasureByLabel"`.
+ * Must be greater than 0. The default value is 1.
+ *
+ * @group param
+ */
@Since("3.0.0")
final val beta: DoubleParam = new DoubleParam(this, "beta",
"The beta value, which controls precision vs recall weighting, " +
@@ -114,6 +129,12 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
setDefault(beta -> 1.0)
+ /**
+ * param for eps. log-loss is undefined for p=0 or p=1, so probabilities are clipped to
+ * max(eps, min(1 - eps, p)). Must be in range (0, 0.5). The default value is 1e-15.
+ *
+ * @group param
+ */
@Since("3.0.0")
final val eps: DoubleParam = new DoubleParam(this, "eps",
"log-loss is undefined for p=0 or p=1, so probabilities are clipped to " +
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala
index ca3a8eb..8d017eb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala
@@ -59,6 +59,11 @@ class RankingEvaluator (override val uid: String)
setDefault(metricName -> "meanAveragePrecision")
+ /**
+ * param for ranking position value used in `"meanAveragePrecisionAtK"`, `"precisionAtK"`,
+ * `"ndcgAtK"`, `"recallAtK"`. Must be > 0. The default value is 10.
+ * @group param
+ */
final val k = new IntParam(this, "k",
"The ranking position value used in " +
s"${supportedMetricNames.filter(_.endsWith("AtK")).mkString("(", "|", ")")} " +
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
index 9f32d40..18a8dda 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
@@ -27,7 +27,8 @@ import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}
/**
- * Evaluator for regression, which expects two input columns: prediction and label.
+ * Evaluator for regression, which expects input columns prediction, label and
+ * an optional weight column.
*/
@Since("1.4.0")
final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val uid: String)
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 556a2f8..265f02c 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -110,7 +110,8 @@ class JavaEvaluator(JavaParams, Evaluator):
class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol, HasWeightCol,
JavaMLReadable, JavaMLWritable):
"""
- Evaluator for binary classification, which expects two input columns: rawPrediction and label.
+ Evaluator for binary classification, which expects input columns rawPrediction, label
+ and an optional weight column.
The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label
1) or of type vector (length-2 vector of raw predictions, scores, or label probabilities).
@@ -409,9 +410,9 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
"""
metricName = Param(Params._dummy(), "metricName",
"metric name in evaluation "
- "(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate|"
- "weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel|"
- "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel|"
+ "(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate| "
+ "weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| "
+ "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| "
"logLoss|hammingLoss)",
typeConverter=TypeConverters.toString)
metricLabel = Param(Params._dummy(), "metricLabel",
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 4c25bb4..6df2f74 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -1467,7 +1467,7 @@ class Imputer(JavaEstimator, _ImputerParams, JavaMLReadable, JavaMLWritable):
"""
Imputation estimator for completing missing values, either using the mean or the median
of the columns in which the missing values are located. The input columns should be of
- DoubleType or FloatType. Currently Imputer does not support categorical features and
+ numeric type. Currently Imputer does not support categorical features and
possibly creates incorrect values for a categorical feature.
Note that the mean/median value is computed after filtering out missing values.
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org