You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ml...@apache.org on 2016/03/10 12:21:31 UTC
spark git commit: [SPARK-11108][ML] OneHotEncoder should support
other numeric types
Repository: spark
Updated Branches:
refs/heads/master 9525c563d -> 9fe38aba1
[SPARK-11108][ML] OneHotEncoder should support other numeric types
Adding support for other numeric types:
* Integer
* Short
* Long
* Float
* Decimal
Author: sethah <se...@gmail.com>
Closes #9777 from sethah/SPARK-11108.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9fe38aba
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9fe38aba
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9fe38aba
Branch: refs/heads/master
Commit: 9fe38aba1f70a4cb19ec1f9df4814fce0b267b54
Parents: 9525c56
Author: sethah <se...@gmail.com>
Authored: Thu Mar 10 13:17:41 2016 +0200
Committer: Nick Pentreath <ni...@gmail.com>
Committed: Thu Mar 10 13:17:41 2016 +0200
----------------------------------------------------------------------
.../apache/spark/ml/feature/OneHotEncoder.scala | 9 ++++--
.../spark/ml/feature/OneHotEncoderSuite.scala | 29 ++++++++++++++++++++
2 files changed, 35 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/9fe38aba/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
index e9df161..fa5013d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
@@ -26,7 +26,7 @@ import org.apache.spark.ml.util._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
-import org.apache.spark.sql.types.{DoubleType, StructType}
+import org.apache.spark.sql.types.{DoubleType, NumericType, StructType}
/**
* :: Experimental ::
@@ -70,7 +70,8 @@ class OneHotEncoder(override val uid: String) extends Transformer
val inputColName = $(inputCol)
val outputColName = $(outputCol)
- SchemaUtils.checkColumnType(schema, inputColName, DoubleType)
+ require(schema(inputColName).dataType.isInstanceOf[NumericType],
+ s"Input column must be of type NumericType but got ${schema(inputColName).dataType}")
val inputFields = schema.fields
require(!inputFields.exists(_.name == outputColName),
s"Output column $outputColName already exists.")
@@ -133,7 +134,9 @@ class OneHotEncoder(override val uid: String) extends Transformer
val numAttrs = dataset.select(col(inputColName).cast(DoubleType)).rdd.map(_.getDouble(0))
.aggregate(0.0)(
(m, x) => {
- assert(x >=0.0 && x == x.toInt,
+ assert(x <= Int.MaxValue,
+ s"OneHotEncoder only supports up to ${Int.MaxValue} indices, but got $x")
+ assert(x >= 0.0 && x == x.toInt,
s"Values from column $inputColName must be indices, but got $x.")
math.max(m, x)
},
http://git-wip-us.apache.org/repos/asf/spark/blob/9fe38aba/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
index e238b33..49803ae 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
@@ -25,6 +25,7 @@ import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
+import org.apache.spark.sql.types._
class OneHotEncoderSuite
extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
@@ -111,4 +112,32 @@ class OneHotEncoderSuite
.setDropLast(false)
testDefaultReadWrite(t)
}
+
+ test("OneHotEncoder with varying types") {
+ val df = stringIndexed()
+ val dfWithTypes = df
+ .withColumn("shortLabel", df("labelIndex").cast(ShortType))
+ .withColumn("longLabel", df("labelIndex").cast(LongType))
+ .withColumn("intLabel", df("labelIndex").cast(IntegerType))
+ .withColumn("floatLabel", df("labelIndex").cast(FloatType))
+ .withColumn("decimalLabel", df("labelIndex").cast(DecimalType(10, 0)))
+ val cols = Array("labelIndex", "shortLabel", "longLabel", "intLabel",
+ "floatLabel", "decimalLabel")
+ for (col <- cols) {
+ val encoder = new OneHotEncoder()
+ .setInputCol(col)
+ .setOutputCol("labelVec")
+ .setDropLast(false)
+ val encoded = encoder.transform(dfWithTypes)
+
+ val output = encoded.select("id", "labelVec").rdd.map { r =>
+ val vec = r.getAs[Vector](1)
+ (r.getInt(0), vec(0), vec(1), vec(2))
+ }.collect().toSet
+ // a -> 0, b -> 2, c -> 1
+ val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0),
+ (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0))
+ assert(output === expected)
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org