You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by yl...@apache.org on 2017/07/20 12:07:24 UTC
spark git commit: [MINOR][ML] Reorg RFormula params.
Repository: spark
Updated Branches:
refs/heads/master 256358f66 -> 5d1850d4b
[MINOR][ML] Reorg RFormula params.
## What changes were proposed in this pull request?
There are mainly two reasons for this reorg:
* Some params are placed in ```RFormulaBase```, while others are placed in ```RFormula```, this is disordered.
* ```RFormulaModel``` should have params ```handleInvalid```, ```formula``` and ```forceIndexLabel```, that users can get invalid values handling policy, formula or whether to force index label if they only have a ```RFormulaModel```. So we need move these params to ```RFormulaBase``` which is also inherited by ```RFormulaModel```.
* ```RFormulaModel``` should support set different ```handleInvalid``` when cross validation.
## How was this patch tested?
Existing tests.
Author: Yanbo Liang <yb...@gmail.com>
Closes #18681 from yanboliang/rformula-reorg.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5d1850d4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5d1850d4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5d1850d4
Branch: refs/heads/master
Commit: 5d1850d4b541a8108c934a174097f3c7e10b5315
Parents: 256358f
Author: Yanbo Liang <yb...@gmail.com>
Authored: Thu Jul 20 20:07:16 2017 +0800
Committer: Yanbo Liang <yb...@gmail.com>
Committed: Thu Jul 20 20:07:16 2017 +0800
----------------------------------------------------------------------
.../org/apache/spark/ml/feature/RFormula.scala | 95 ++++++++++----------
1 file changed, 47 insertions(+), 48 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/5d1850d4/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index c224454..7da3339 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -35,7 +35,51 @@ import org.apache.spark.sql.types._
/**
* Base trait for [[RFormula]] and [[RFormulaModel]].
*/
-private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
+private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol with HasHandleInvalid {
+
+ /**
+ * R formula parameter. The formula is provided in string form.
+ * @group param
+ */
+ @Since("1.5.0")
+ val formula: Param[String] = new Param(this, "formula", "R model formula")
+
+ /** @group getParam */
+ @Since("1.5.0")
+ def getFormula: String = $(formula)
+
+ /**
+ * Force to index label whether it is numeric or string type.
+ * Usually we index label only when it is string type.
+ * If the formula was used by classification algorithms,
+ * we can force to index label even it is numeric type by setting this param with true.
+ * Default: false.
+ * @group param
+ */
+ @Since("2.1.0")
+ val forceIndexLabel: BooleanParam = new BooleanParam(this, "forceIndexLabel",
+ "Force to index label whether it is numeric or string")
+ setDefault(forceIndexLabel -> false)
+
+ /** @group getParam */
+ @Since("2.1.0")
+ def getForceIndexLabel: Boolean = $(forceIndexLabel)
+
+ /**
+ * Param for how to handle invalid data (unseen or NULL values) in features and label column
+ * of string type. Options are 'skip' (filter out rows with invalid data),
+ * 'error' (throw an error), or 'keep' (put invalid data in a special additional
+ * bucket, at index numLabels).
+ * Default: "error"
+ * @group param
+ */
+ @Since("2.3.0")
+ final override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid",
+ "How to handle invalid data (unseen or NULL values) in features and label column of string " +
+ "type. Options are 'skip' (filter out rows with invalid data), error (throw an error), " +
+ "or 'keep' (put invalid data in a special additional bucket, at index numLabels).",
+ ParamValidators.inArray(StringIndexer.supportedHandleInvalids))
+ setDefault(handleInvalid, StringIndexer.ERROR_INVALID)
/**
* Param for how to order categories of a string FEATURE column used by `StringIndexer`.
@@ -68,6 +112,7 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
"The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', " +
"RFormula drops the same category as R when encoding strings.",
ParamValidators.inArray(StringIndexer.supportedStringOrderType))
+ setDefault(stringIndexerOrderType, StringIndexer.frequencyDesc)
/** @group getParam */
@Since("2.3.0")
@@ -108,20 +153,12 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
@Experimental
@Since("1.5.0")
class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
- extends Estimator[RFormulaModel] with RFormulaBase with HasHandleInvalid
- with DefaultParamsWritable {
+ extends Estimator[RFormulaModel] with RFormulaBase with DefaultParamsWritable {
@Since("1.5.0")
def this() = this(Identifiable.randomUID("rFormula"))
/**
- * R formula parameter. The formula is provided in string form.
- * @group param
- */
- @Since("1.5.0")
- val formula: Param[String] = new Param(this, "formula", "R model formula")
-
- /**
* Sets the formula to use for this transformer. Must be called before use.
* @group setParam
* @param value an R formula in string form (e.g. "y ~ x + z")
@@ -129,26 +166,6 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
@Since("1.5.0")
def setFormula(value: String): this.type = set(formula, value)
- /** @group getParam */
- @Since("1.5.0")
- def getFormula: String = $(formula)
-
- /**
- * Param for how to handle invalid data (unseen or NULL values) in features and label column
- * of string type. Options are 'skip' (filter out rows with invalid data),
- * 'error' (throw an error), or 'keep' (put invalid data in a special additional
- * bucket, at index numLabels).
- * Default: "error"
- * @group param
- */
- @Since("2.3.0")
- override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "How to " +
- "handle invalid data (unseen or NULL values) in features and label column of string type. " +
- "Options are 'skip' (filter out rows with invalid data), error (throw an error), " +
- "or 'keep' (put invalid data in a special additional bucket, at index numLabels).",
- ParamValidators.inArray(StringIndexer.supportedHandleInvalids))
- setDefault(handleInvalid, StringIndexer.ERROR_INVALID)
-
/** @group setParam */
@Since("2.3.0")
def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
@@ -161,23 +178,6 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
@Since("1.5.0")
def setLabelCol(value: String): this.type = set(labelCol, value)
- /**
- * Force to index label whether it is numeric or string type.
- * Usually we index label only when it is string type.
- * If the formula was used by classification algorithms,
- * we can force to index label even it is numeric type by setting this param with true.
- * Default: false.
- * @group param
- */
- @Since("2.1.0")
- val forceIndexLabel: BooleanParam = new BooleanParam(this, "forceIndexLabel",
- "Force to index label whether it is numeric or string")
- setDefault(forceIndexLabel -> false)
-
- /** @group getParam */
- @Since("2.1.0")
- def getForceIndexLabel: Boolean = $(forceIndexLabel)
-
/** @group setParam */
@Since("2.1.0")
def setForceIndexLabel(value: Boolean): this.type = set(forceIndexLabel, value)
@@ -185,7 +185,6 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
/** @group setParam */
@Since("2.3.0")
def setStringIndexerOrderType(value: String): this.type = set(stringIndexerOrderType, value)
- setDefault(stringIndexerOrderType, StringIndexer.frequencyDesc)
/** Whether the formula specifies fitting an intercept. */
private[ml] def hasIntercept: Boolean = {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org