Posted to commits@spark.apache.org by sr...@apache.org on 2020/03/07 17:45:08 UTC

[spark] branch branch-3.0 updated: [SPARK-31012][ML][PYSPARK][DOCS] Updating ML API docs for 3.0 changes

This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new d73ea97  [SPARK-31012][ML][PYSPARK][DOCS] Updating ML API docs for 3.0 changes
d73ea97 is described below

commit d73ea9791bff2c383b10d731cf651b2ebe159dde
Author: Huaxin Gao <hu...@us.ibm.com>
AuthorDate: Sat Mar 7 11:42:05 2020 -0600

    [SPARK-31012][ML][PYSPARK][DOCS] Updating ML API docs for 3.0 changes
    
    ### What changes were proposed in this pull request?
    Updating ML docs for 3.0 changes
    
    ### Why are the changes needed?
    While auditing the 3.0 ML changes, I found that some docs were missing or not updated; this patch brings them up to date.
    
    ### Does this PR introduce any user-facing change?
    Yes, documentation changes only.
    
    ### How was this patch tested?
    Built the documentation manually and inspected the generated docs.
    
    Closes #27762 from huaxingao/spark-doc.
    
    Authored-by: Huaxin Gao <hu...@us.ibm.com>
    Signed-off-by: Sean Owen <sr...@gmail.com>
    (cherry picked from commit 4a64901ab7176ad3d41c4598dea1e73cd92e969e)
    Signed-off-by: Sean Owen <sr...@gmail.com>
---
 docs/ml-features.md                                 |  2 +-
 .../evaluation/BinaryClassificationEvaluator.scala  |  3 ++-
 .../MulticlassClassificationEvaluator.scala         | 21 +++++++++++++++++++++
 .../spark/ml/evaluation/RankingEvaluator.scala      |  5 +++++
 .../spark/ml/evaluation/RegressionEvaluator.scala   |  3 ++-
 python/pyspark/ml/evaluation.py                     |  9 +++++----
 python/pyspark/ml/feature.py                        |  2 +-
 7 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 9c05fd5..05ef848 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1499,7 +1499,7 @@ for more details on the API.
 
 The `Imputer` estimator completes missing values in a dataset, either using the mean or the 
 median of the columns in which the missing values are located. The input columns should be of
-`DoubleType` or `FloatType`. Currently `Imputer` does not support categorical features and possibly
+numeric type. Currently `Imputer` does not support categorical features and possibly
 creates incorrect values for columns containing categorical features. Imputer can impute custom values 
 other than 'NaN' by `.setMissingValue(custom_value)`. For example, `.setMissingValue(0)` will impute 
 all occurrences of (0).
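
For illustration only (not part of this patch), here is a minimal PySpark sketch of the Imputer usage described above. The column names and toy data are invented, and the integer column relies on the relaxed numeric-type input documented in this change:

    from pyspark.ml.feature import Imputer
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Toy data: one integer and one double column, with 0 marking a missing reading.
    df = spark.createDataFrame(
        [(1, 2.0), (0, 3.0), (4, 0.0)], ["sensor_a", "sensor_b"])

    # Impute the custom missing value 0 with the per-column median.
    imputer = (Imputer(inputCols=["sensor_a", "sensor_b"],
                       outputCols=["sensor_a_imp", "sensor_b_imp"])
               .setStrategy("median")
               .setMissingValue(0))
    imputer.fit(df).transform(df).show()
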
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
index 55b910e..7733225 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -28,7 +28,8 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.DoubleType
 
 /**
- * Evaluator for binary classification, which expects two input columns: rawPrediction and label.
+ * Evaluator for binary classification, which expects input columns rawPrediction, label and
+ *  an optional weight column.
  * The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label 1)
  * or of type vector (length-2 vector of raw predictions, scores, or label probabilities).
  */
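
As an illustrative aside (not part of the commit), a small PySpark sketch of the evaluator with the optional weight column now mentioned in the Scaladoc; the data and column names are made up:

    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Toy scored data: probability-of-label-1 score, true label, per-row weight.
    scored = spark.createDataFrame(
        [(0.9, 1.0, 1.0), (0.2, 0.0, 2.0), (0.6, 1.0, 0.5)],
        ["rawPrediction", "label", "weight"])

    # weightCol is optional; leave it unset for the unweighted metric.
    evaluator = BinaryClassificationEvaluator(
        rawPredictionCol="rawPrediction", labelCol="label",
        weightCol="weight", metricName="areaUnderROC")
    print(evaluator.evaluate(scored))
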
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
index 4357081..1d6540e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
@@ -81,6 +81,14 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
   @Since("3.0.0")
   def setProbabilityCol(value: String): this.type = set(probabilityCol, value)
 
+  /**
+   * The class whose metric will be computed in `"truePositiveRateByLabel"`,
+   * `"falsePositiveRateByLabel"`, `"precisionByLabel"`, `"recallByLabel"`,
+   * `"fMeasureByLabel"`.
+   * Must be greater than or equal to 0. The default value is 0.
+   *
+   * @group param
+   */
   @Since("3.0.0")
   final val metricLabel: DoubleParam = new DoubleParam(this, "metricLabel",
     "The class whose metric will be computed in " +
@@ -98,6 +106,13 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
 
   setDefault(metricLabel -> 0.0)
 
+  /**
+   * The beta value, which controls precision vs recall weighting,
+   * used in `"weightedFMeasure"`, `"fMeasureByLabel"`.
+   * Must be greater than 0. The default value is 1.
+   *
+   * @group param
+   */
   @Since("3.0.0")
   final val beta: DoubleParam = new DoubleParam(this, "beta",
     "The beta value, which controls precision vs recall weighting, " +
@@ -114,6 +129,12 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
 
   setDefault(beta -> 1.0)
 
+  /**
+   * param for eps. log-loss is undefined for p=0 or p=1, so probabilities are clipped to
+   * max(eps, min(1 - eps, p)). Must be in range (0, 0.5). The default value is 1e-15.
+   *
+   * @group param
+   */
   @Since("3.0.0")
   final val eps: DoubleParam = new DoubleParam(this, "eps",
     "log-loss is undefined for p=0 or p=1, so probabilities are clipped to " +
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala
index ca3a8eb..8d017eb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala
@@ -59,6 +59,11 @@ class RankingEvaluator (override val uid: String)
 
   setDefault(metricName -> "meanAveragePrecision")
 
+  /**
+   * param for ranking position value used in `"meanAveragePrecisionAtK"`, `"precisionAtK"`,
+   * `"ndcgAtK"`, `"recallAtK"`. Must be &gt; 0. The default value is 10.
+   * @group param
+   */
   final val k = new IntParam(this, "k",
     "The ranking position value used in " +
       s"${supportedMetricNames.filter(_.endsWith("AtK")).mkString("(", "|", ")")}  " +
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
index 9f32d40..18a8dda 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
@@ -27,7 +27,8 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.{DoubleType, FloatType}
 
 /**
- * Evaluator for regression, which expects two input columns: prediction and label.
+ * Evaluator for regression, which expects input columns prediction, label and
+ * an optional weight column.
  */
 @Since("1.4.0")
 final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val uid: String)
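
One more illustrative sketch (not part of the commit) of the optional weight column now mentioned in the RegressionEvaluator Scaladoc; the column names and values are made up:

    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Toy regression output: prediction, label, and a per-row weight.
    preds = spark.createDataFrame(
        [(2.5, 3.0, 1.0), (0.0, -0.5, 2.0), (7.0, 8.0, 0.5)],
        ["prediction", "label", "weight"])

    # weightCol is optional; omit it for an unweighted RMSE.
    evaluator = RegressionEvaluator(
        predictionCol="prediction", labelCol="label",
        weightCol="weight", metricName="rmse")
    print(evaluator.evaluate(preds))
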
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 556a2f8..265f02c 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -110,7 +110,8 @@ class JavaEvaluator(JavaParams, Evaluator):
 class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol, HasWeightCol,
                                     JavaMLReadable, JavaMLWritable):
     """
-    Evaluator for binary classification, which expects two input columns: rawPrediction and label.
+    Evaluator for binary classification, which expects input columns rawPrediction, label
+    and an optional weight column.
     The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label
     1) or of type vector (length-2 vector of raw predictions, scores, or label probabilities).
 
@@ -409,9 +410,9 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
     """
     metricName = Param(Params._dummy(), "metricName",
                        "metric name in evaluation "
-                       "(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate|"
-                       "weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel|"
-                       "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel|"
+                       "(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate| "
+                       "weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| "
+                       "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| "
                        "logLoss|hammingLoss)",
                        typeConverter=TypeConverters.toString)
     metricLabel = Param(Params._dummy(), "metricLabel",
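
Finally, a tentative sketch of the logLoss metric listed above, assuming the 3.0 PySpark evaluator accepts probabilityCol and eps as keyword arguments in line with the Scala params; the probabilities and labels are invented:

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    from pyspark.ml.linalg import Vectors
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Toy rows: predicted class, true label, and a per-class probability vector.
    preds = spark.createDataFrame(
        [(0.0, 0.0, Vectors.dense(0.8, 0.2)),
         (1.0, 1.0, Vectors.dense(0.3, 0.7)),
         (0.0, 1.0, Vectors.dense(0.6, 0.4))],
        ["prediction", "label", "probability"])

    # logLoss reads the probability column; eps clips probabilities away from 0 and 1.
    evaluator = MulticlassClassificationEvaluator(
        metricName="logLoss", probabilityCol="probability", eps=1e-15)
    print(evaluator.evaluate(preds))
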
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 4c25bb4..6df2f74 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -1467,7 +1467,7 @@ class Imputer(JavaEstimator, _ImputerParams, JavaMLReadable, JavaMLWritable):
     """
     Imputation estimator for completing missing values, either using the mean or the median
     of the columns in which the missing values are located. The input columns should be of
-    DoubleType or FloatType. Currently Imputer does not support categorical features and
+    numeric type. Currently Imputer does not support categorical features and
     possibly creates incorrect values for a categorical feature.
 
     Note that the mean/median value is computed after filtering out missing values.


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org