You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2019/07/26 03:13:29 UTC

[spark] branch master updated: [SPARK-28365][ML] Fallback locale to en_US in StopWordsRemover if system default locale isn't in available locales in JVM

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new ded1a74  [SPARK-28365][ML] Fallback locale to en_US in StopWordsRemover if system default locale isn't in available locales in JVM
ded1a74 is described below

commit ded1a7495b443f4735057eb5520f31df5b9860d2
Author: Liang-Chi Hsieh <vi...@gmail.com>
AuthorDate: Fri Jul 26 12:13:10 2019 +0900

    [SPARK-28365][ML] Fallback locale to en_US in StopWordsRemover if system default locale isn't in available locales in JVM
    
    ## What changes were proposed in this pull request?
    
    My local default locale isn't among the available locales reported by `Locale.getAvailableLocales`, so when I ran the Python tests locally, the `StopWordsRemover`-related test failed with errors like:
    
    ```
    Traceback (most recent call last):
      File "/spark-1/python/pyspark/ml/tests/test_feature.py", line 87, in test_stopwordsremover
        stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output")
      File "/spark-1/python/pyspark/__init__.py", line 111, in wrapper
        return func(self, **kwargs)
      File "/spark-1/python/pyspark/ml/feature.py", line 2646, in __init__
        self.uid)
      File "/spark-1/python/pyspark/ml/wrapper.py", line 67, in _new_java_obj
        return java_obj(*java_args)
      File "/spark-1/python/lib/py4j-0.10.8.1-src.zip/py4j/java_gateway.py", line 1554, in __call__
        answer, self._gateway_client, None, self._fqn)
      File "/spark-1/python/pyspark/sql/utils.py", line 93, in deco
        raise converted
    pyspark.sql.utils.IllegalArgumentException: 'StopWordsRemover_4598673ee802 parameter locale given invalid value en_TW.'
    ```
    
    As per HyukjinKwon's advice, instead of setting up a locale just to make the test pass, it is better to fall back to a workable locale when the system default locale can't be found among the available locales in the JVM. Otherwise, users have to manually change the system locale or access the private property `_jvm` in PySpark.
    
    ## How was this patch tested?
    
    Added test and manual test.
    
    ```
    scala> val remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered")
    19/07/14 19:20:03 WARN StopWordsRemover: Default locale set was [en_TW]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.
    ```
    
    Closes #25133 from viirya/pytest-default-locale.
    
    Authored-by: Liang-Chi Hsieh <vi...@gmail.com>
    Signed-off-by: HyukjinKwon <gu...@apache.org>
---
 .../apache/spark/ml/feature/StopWordsRemover.scala   | 20 ++++++++++++++++++--
 .../spark/ml/feature/StopWordsRemoverSuite.scala     | 17 +++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
index 6669d40..f95e03a 100755
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
@@ -89,7 +89,8 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String
   /**
    * Locale of the input for case insensitive matching. Ignored when [[caseSensitive]]
    * is true.
-   * Default: Locale.getDefault.toString
+   * Default: the string of default locale (`Locale.getDefault`), or `Locale.US` if default locale
+   * is not in available locales in JVM.
    * @group param
    */
   @Since("2.4.0")
@@ -105,8 +106,23 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String
   @Since("2.4.0")
   def getLocale: String = $(locale)
 
+  /**
+   * System default locale (`Locale.getDefault`), or `Locale.US` when that default is not among
+   * `Locale.getAvailableLocales`; logs a warning on fallback. A `val`: evaluated once per instance.
+   */
+  private val getDefaultOrUS: Locale = {
+    if (Locale.getAvailableLocales.contains(Locale.getDefault)) {
+      Locale.getDefault
+    } else {
+      logWarning(s"Default locale set was [${Locale.getDefault.toString}]; however, it was " +
+        "not found in available locales in JVM, falling back to en_US locale. Set param `locale` " +
+        "in order to respect another locale.")
+      Locale.US
+    }
+  }
+
   setDefault(stopWords -> StopWordsRemover.loadDefaultStopWords("english"),
-    caseSensitive -> false, locale -> Locale.getDefault.toString)
+    caseSensitive -> false, locale -> getDefaultOrUS.toString)
 
   @Since("2.0.0")
   override def transform(dataset: Dataset[_]): DataFrame = {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
index 20972d1..6d0b83e 100755
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.ml.feature
 
+import java.util.Locale
+
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest}
 import org.apache.spark.sql.{DataFrame, Row}
 
@@ -200,4 +202,19 @@ class StopWordsRemoverSuite extends MLTest with DefaultReadWriteTest {
       s"requirement failed: Column $outputCol already exists.",
       "expected")
   }
+
+  test("SPARK-28365: Fallback to en_US if default locale isn't in available locales") {
+    val oldDefault = Locale.getDefault()  // remembered so the finally block can restore it
+    try {
+      val dummyLocale = Locale.forLanguageTag("test")  // assumed absent from available locales
+      Locale.setDefault(dummyLocale)  // must happen before the remover is constructed
+
+      val remover = new StopWordsRemover()
+        .setInputCol("raw")
+        .setOutputCol("filtered")
+      assert(remover.getLocale == "en_US")  // the `locale` param default fell back to en_US
+    } finally {
+      Locale.setDefault(oldDefault)
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org