Posted to commits@spark.apache.org by gu...@apache.org on 2020/01/26 04:02:13 UTC

[spark] branch branch-2.4 updated: [SPARK-30645][SPARKR][TESTS][WINDOWS] Move Unicode test data to external file

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new 1b3ddcf  [SPARK-30645][SPARKR][TESTS][WINDOWS] Move Unicode test data to external file
1b3ddcf is described below

commit 1b3ddcff88fcb1266639323e0f1584c4e5a69971
Author: zero323 <ms...@gmail.com>
AuthorDate: Sun Jan 26 12:59:53 2020 +0900

    [SPARK-30645][SPARKR][TESTS][WINDOWS] Move Unicode test data to external file
    
    ### What changes were proposed in this pull request?
    
    Reference data for the "collect() support Unicode characters" test has been moved to an external file, to make the test OS and locale independent.
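    
    For illustration, the pattern is to read the fixture with an explicit encoding instead of embedding Unicode literals in the R source (a minimal sketch; the relative path here is illustrative, the actual test resolves it via `SPARK_HOME` as shown in the diff below):
    
    ```
    # Keep the Unicode reference data in a UTF-8 encoded fixture and read it
    # back with an explicit encoding, so the OS locale no longer affects how
    # the strings are interpreted.
    jsonPath <- file.path("R", "pkg", "tests", "fulltests", "data",
                          "test_utils_utf.json")
    lines <- readLines(jsonPath, encoding = "UTF-8")
    ```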
    
    ### Why are the changes needed?
    
    As-is, the embedded data is not properly encoded on Windows:
    
    ```
    library(SparkR)
    SparkR::sparkR.session()
    Sys.info()
    #           sysname           release           version
    #         "Windows"      "Server x64"     "build 17763"
    #          nodename           machine             login
    # "WIN-5BLT6Q610KH"          "x86-64"   "Administrator"
    #              user    effective_user
    #   "Administrator"   "Administrator"
    
    Sys.getlocale()
    
    # [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
    
    lines <- c("{\"name\":\"안녕하세요\"}",
               "{\"name\":\"您好\", \"age\":30}",
               "{\"name\":\"こんにちは\", \"age\":19}",
               "{\"name\":\"Xin chào\"}")
    
    jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
    writeLines(lines, jsonPath)
    
    system(paste0("cat ", jsonPath))
    # {"name":"<U+C548><U+B155><U+D558><U+C138><U+C694>"}
    # {"name":"<U+60A8><U+597D>", "age":30}
    # {"name":"<U+3053><U+3093><U+306B><U+3061><U+306F>", "age":19}
    # {"name":"Xin chào"}
    # [1] 0
    
    df <- read.df(jsonPath, "json")
    
    printSchema(df)
    # root
    #  |-- _corrupt_record: string (nullable = true)
    #  |-- age: long (nullable = true)
    #  |-- name: string (nullable = true)
    
    head(df)
    #              _corrupt_record age                                     name
    # 1                       <NA>  NA <U+C548><U+B155><U+D558><U+C138><U+C694>
    # 2                       <NA>  30                         <U+60A8><U+597D>
    # 3                       <NA>  19 <U+3053><U+3093><U+306B><U+3061><U+306F>
    # 4 {"name":"Xin ch<U+FFFD>o"}  NA                                     <NA>
    ```
    This can be reproduced outside the tests (Windows Server 2019, English locale) and causes failures when `testthat` is updated to 2.x (https://github.com/apache/spark/pull/27359). For some reason the problem is not picked up when the test is executed on `testthat` 1.0.2.
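    
    The substitution can be checked directly in base R (a minimal sketch; the behavior described applies to a Windows 1252 locale):
    
    ```
    # When a UTF-8 string is converted to a native encoding that cannot
    # represent it (e.g. cp1252), R renders the characters as <U+XXXX> escapes,
    # which is exactly what shows up in the output above.
    x <- "\uc548\ub155"   # "안녕" written as explicit Unicode escapes
    Encoding(x)           # [1] "UTF-8"
    enc2native(x)         # "<U+C548><U+B155>" on cp1252; unchanged on UTF-8 locales
    ```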
    
    ### Does this PR introduce any user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Running the modified test, plus manual testing.
    
    ### Note
    
    An alternative seems to be to use bytes, but it hasn't been properly tested:
    
    ```
    test_that("collect() support Unicode characters", {
    
      lines <- markUtf8(c(
        '{"name": "안녕하세요"}',
        '{"name": "您好", "age": 30}',
        '{"name": "こんにちは", "age": 19}',
        '{"name": "Xin ch\xc3\xa0o"}'
      ))
    
      jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
      writeLines(lines, jsonPath, useBytes = TRUE)
    
      expected <- regmatches(lines, regexec('(?<="name": ").*?(?=")', lines, perl = TRUE))
    
      df <- read.df(jsonPath, "json")
      rdf <- collect(df)
      expect_true(is.data.frame(rdf))
    
      rdf$name <- markUtf8(rdf$name)
      expect_equal(rdf$name[1], expected[[1]])
      expect_equal(rdf$name[2], expected[[2]])
      expect_equal(rdf$name[3], expected[[3]])
      expect_equal(rdf$name[4], expected[[4]])
    
      df1 <- createDataFrame(rdf)
      expect_equal(
        collect(
          where(df1, df1$name == expected[[2]])
        )$name,
        expected[[2]]
      )
    })
    ```
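    
    For context, `writeLines(..., useBytes = TRUE)` writes the marked UTF-8 bytes verbatim instead of re-encoding them into the native locale, which is what would make this variant locale independent; since it hasn't been verified on Windows, the external-file approach was taken instead.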
    
    Closes #27362 from zero323/SPARK-30645.
    
    Authored-by: zero323 <ms...@gmail.com>
    Signed-off-by: HyukjinKwon <gu...@apache.org>
    (cherry picked from commit 40b1f4d87e0f24e4e7e2fd6fe37cc5398ae778f8)
    Signed-off-by: HyukjinKwon <gu...@apache.org>
---
 R/pkg/tests/fulltests/data/test_utils_utf.json |  4 ++++
 R/pkg/tests/fulltests/test_sparkSQL.R          | 29 ++++++++++++++++----------
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/R/pkg/tests/fulltests/data/test_utils_utf.json b/R/pkg/tests/fulltests/data/test_utils_utf.json
new file mode 100644
index 0000000..b78352e
--- /dev/null
+++ b/R/pkg/tests/fulltests/data/test_utils_utf.json
@@ -0,0 +1,4 @@
+{"name": "안녕하세요"}
+{"name": "您好", "age": 30}
+{"name": "こんにちは", "age": 19}
+{"name": "Xin chào"}
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index 3f7eee3..cd262a4 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -875,24 +875,31 @@ test_that("collect() and take() on a DataFrame return the same number of rows an
 })
 
 test_that("collect() support Unicode characters", {
-  lines <- c("{\"name\":\"안녕하세요\"}",
-             "{\"name\":\"您好\", \"age\":30}",
-             "{\"name\":\"こんにちは\", \"age\":19}",
-             "{\"name\":\"Xin chào\"}")
+  jsonPath <- file.path(
+    Sys.getenv("SPARK_HOME"),
+    "R", "pkg", "tests", "fulltests", "data",
+    "test_utils_utf.json"
+  )
+
+  lines <- readLines(jsonPath, encoding = "UTF-8")
 
-  jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-  writeLines(lines, jsonPath)
+  expected <- regmatches(lines, gregexpr('(?<="name": ").*?(?=")', lines, perl = TRUE))
 
   df <- read.df(jsonPath, "json")
   rdf <- collect(df)
   expect_true(is.data.frame(rdf))
-  expect_equal(rdf$name[1], markUtf8("안녕하세요"))
-  expect_equal(rdf$name[2], markUtf8("您好"))
-  expect_equal(rdf$name[3], markUtf8("こんにちは"))
-  expect_equal(rdf$name[4], markUtf8("Xin chào"))
+  expect_equal(rdf$name[1], expected[[1]])
+  expect_equal(rdf$name[2], expected[[2]])
+  expect_equal(rdf$name[3], expected[[3]])
+  expect_equal(rdf$name[4], expected[[4]])
 
   df1 <- createDataFrame(rdf)
-  expect_equal(collect(where(df1, df1$name == markUtf8("您好")))$name, markUtf8("您好"))
+  expect_equal(
+    collect(
+      where(df1, df1$name == expected[[2]])
+    )$name,
+    expected[[2]]
+  )
 })
 
 test_that("multiple pipeline transformations result in an RDD with the correct values", {

