You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2019/07/26 03:13:29 UTC
[spark] branch master updated: [SPARK-28365][ML] Fallback locale to
en_US in StopWordsRemover if system default locale isn't in available
locales in JVM
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new ded1a74 [SPARK-28365][ML] Fallback locale to en_US in StopWordsRemover if system default locale isn't in available locales in JVM
ded1a74 is described below
commit ded1a7495b443f4735057eb5520f31df5b9860d2
Author: Liang-Chi Hsieh <vi...@gmail.com>
AuthorDate: Fri Jul 26 12:13:10 2019 +0900
[SPARK-28365][ML] Fallback locale to en_US in StopWordsRemover if system default locale isn't in available locales in JVM
## What changes were proposed in this pull request?
Because my local default locale isn't among the locales returned by `Locale.getAvailableLocales`, the `StopWordsRemover`-related Python tests fail when I run them locally, with errors like:
```
Traceback (most recent call last):
File "/spark-1/python/pyspark/ml/tests/test_feature.py", line 87, in test_stopwordsremover
stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output")
File "/spark-1/python/pyspark/__init__.py", line 111, in wrapper
return func(self, **kwargs)
File "/spark-1/python/pyspark/ml/feature.py", line 2646, in __init__
self.uid)
File "/spark-1/python/pyspark/ml/wrapper.py", line 67, in _new_java_obj
return java_obj(*java_args)
File "/spark-1/python/lib/py4j-0.10.8.1-src.zip/py4j/java_gateway.py", line 1554, in __call__
answer, self._gateway_client, None, self._fqn)
File "/spark-1/python/pyspark/sql/utils.py", line 93, in deco
raise converted
pyspark.sql.utils.IllegalArgumentException: 'StopWordsRemover_4598673ee802 parameter locale given invalid value en_TW.'
```
As per HyukjinKwon's advice, instead of changing the locale just to make the test pass, it is better to fall back to a workable locale when the system default locale can't be found among the available locales in the JVM. Otherwise, users have to manually change the system locale or access the private `_jvm` property in PySpark.
## How was this patch tested?
Added a unit test and tested manually:
```
scala> val remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered")
19/07/14 19:20:03 WARN StopWordsRemover: Default locale set was [en_TW]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.
```
Closes #25133 from viirya/pytest-default-locale.
Authored-by: Liang-Chi Hsieh <vi...@gmail.com>
Signed-off-by: HyukjinKwon <gu...@apache.org>
---
.../apache/spark/ml/feature/StopWordsRemover.scala | 20 ++++++++++++++++++--
.../spark/ml/feature/StopWordsRemoverSuite.scala | 17 +++++++++++++++++
2 files changed, 35 insertions(+), 2 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
index 6669d40..f95e03a 100755
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
@@ -89,7 +89,8 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String
/**
* Locale of the input for case insensitive matching. Ignored when [[caseSensitive]]
* is true.
- * Default: Locale.getDefault.toString
+ * Default: the string of default locale (`Locale.getDefault`), or `Locale.US` if default locale
+ * is not in available locales in JVM.
* @group param
*/
@Since("2.4.0")
@@ -105,8 +106,23 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String
@Since("2.4.0")
def getLocale: String = $(locale)
+ /**
+ * System default locale (`Locale.getDefault`), or `Locale.US` with a logged warning when the
+ * default is not among `Locale.getAvailableLocales`. A `val`, so evaluated once per instance.
+ */
+ private val getDefaultOrUS: Locale = {
+ if (Locale.getAvailableLocales.contains(Locale.getDefault)) {
+ Locale.getDefault
+ } else {
+ logWarning(s"Default locale set was [${Locale.getDefault.toString}]; however, it was " +
+ "not found in available locales in JVM, falling back to en_US locale. Set param `locale` " +
+ "in order to respect another locale.")
+ Locale.US
+ }
+ }
+
setDefault(stopWords -> StopWordsRemover.loadDefaultStopWords("english"),
- caseSensitive -> false, locale -> Locale.getDefault.toString)
+ caseSensitive -> false, locale -> getDefaultOrUS.toString)
@Since("2.0.0")
override def transform(dataset: Dataset[_]): DataFrame = {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
index 20972d1..6d0b83e 100755
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
@@ -17,6 +17,8 @@
package org.apache.spark.ml.feature
+import java.util.Locale
+
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.sql.{DataFrame, Row}
@@ -200,4 +202,19 @@ class StopWordsRemoverSuite extends MLTest with DefaultReadWriteTest {
s"requirement failed: Column $outputCol already exists.",
"expected")
}
+
+ test("SPARK-28365: Fallback to en_US if default locale isn't in available locales") {
+ val oldDefault = Locale.getDefault() // saved so the finally block can restore it
+ try {
+ val dummyLocale = Locale.forLanguageTag("test") // presumably not an available locale
+ Locale.setDefault(dummyLocale)
+
+ val remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered")
+ assert(remover.getLocale == "en_US") // default `locale` param must be the en_US fallback
+ } finally {
+ Locale.setDefault(oldDefault)
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org