Posted to commits@spark.apache.org by gu...@apache.org on 2020/01/26 04:02:13 UTC
[spark] branch branch-2.4 updated: [SPARK-30645][SPARKR][TESTS][WINDOWS] Move Unicode test data to external file
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-2.4 by this push:
new 1b3ddcf [SPARK-30645][SPARKR][TESTS][WINDOWS] Move Unicode test data to external file
1b3ddcf is described below
commit 1b3ddcff88fcb1266639323e0f1584c4e5a69971
Author: zero323 <ms...@gmail.com>
AuthorDate: Sun Jan 26 12:59:53 2020 +0900
[SPARK-30645][SPARKR][TESTS][WINDOWS] Move Unicode test data to external file
### What changes were proposed in this pull request?
Reference data for the "collect() support Unicode characters" test has been moved to an external file (`R/pkg/tests/fulltests/data/test_utils_utf.json`), to make the test OS and locale independent.
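A minimal sketch of the approach (the path below is illustrative; the actual test resolves the file under `SPARK_HOME`, as the diff further down shows):
```
# Keep the non-ASCII reference data in a UTF-8 encoded file on disk
# instead of as string literals in the R source, so the test no longer
# depends on the OS native encoding.
jsonPath <- "test_utils_utf.json"                 # illustrative path
lines <- readLines(jsonPath, encoding = "UTF-8")  # R side: strings tagged as UTF-8
df <- read.df(jsonPath, "json")                   # Spark side reads the same bytes
```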
### Why are the changes needed?
As-is, embedded data is not properly encoded on Windows:
```
library(SparkR)
SparkR::sparkR.session()
Sys.info()
#           sysname           release           version
#         "Windows"      "Server x64"     "build 17763"
#          nodename           machine             login
# "WIN-5BLT6Q610KH"          "x86-64"   "Administrator"
#              user    effective_user
#   "Administrator"   "Administrator"

Sys.getlocale()
# [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"

lines <- c("{\"name\":\"안녕하세요\"}",
           "{\"name\":\"您好\", \"age\":30}",
           "{\"name\":\"こんにちは\", \"age\":19}",
           "{\"name\":\"Xin chào\"}")

jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
writeLines(lines, jsonPath)

system(paste0("cat ", jsonPath))
# {"name":"<U+C548><U+B155><U+D558><U+C138><U+C694>"}
# {"name":"<U+60A8><U+597D>", "age":30}
# {"name":"<U+3053><U+3093><U+306B><U+3061><U+306F>", "age":19}
# {"name":"Xin chào"}
# [1] 0

df <- read.df(jsonPath, "json")
printSchema(df)
# root
#  |-- _corrupt_record: string (nullable = true)
#  |-- age: long (nullable = true)
#  |-- name: string (nullable = true)

head(df)
#              _corrupt_record age                                      name
# 1                       <NA>  NA <U+C548><U+B155><U+D558><U+C138><U+C694>
# 2                       <NA>  30                          <U+60A8><U+597D>
# 3                       <NA>  19  <U+3053><U+3093><U+306B><U+3061><U+306F>
# 4 {"name":"Xin ch<U+FFFD>o"}  NA                                      <NA>
```
This can be reproduced outside the tests (Windows Server 2019, English locale) and causes failures when `testthat` is updated to 2.x (https://github.com/apache/spark/pull/27359). Somehow the problem is not picked up when the test is executed with `testthat` 1.0.2.
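For context, the mechanism behind the mangling, as a minimal sketch (assuming a Windows session with a cp1252 locale, as in the output above): `writeLines` re-encodes strings to the native locale before writing, so characters that cp1252 cannot represent come out as `<U+XXXX>` escapes, while `useBytes = TRUE` writes the UTF-8 bytes verbatim.
```
s <- "\uc548\ub155"   # "안녕" via Unicode escapes; Encoding(s) is "UTF-8"
f <- tempfile()
writeLines(s, f)      # converted to the native encoding first; on a cp1252
                      # Windows box this is written as "<U+C548><U+B155>"
writeLines(s, f, useBytes = TRUE)  # bytes written as-is; file stays valid UTF-8
```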
### Does this PR introduce any user-facing change?
No.
### How was this patch tested?
Running the modified test, plus manual testing.
### Note
An alternative seems to be to use bytes, but it hasn't been properly tested.
```
test_that("collect() support Unicode characters", {
  lines <- markUtf8(c(
    '{"name": "안녕하세요"}',
    '{"name": "您好", "age": 30}',
    '{"name": "こんにちは", "age": 19}',
    '{"name": "Xin ch\xc3\xa0o"}'
  ))

  jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
  writeLines(lines, jsonPath, useBytes = TRUE)

  expected <- regmatches(lines, regexec('(?<="name": ").*?(?=")', lines, perl = TRUE))

  df <- read.df(jsonPath, "json")
  rdf <- collect(df)
  expect_true(is.data.frame(rdf))

  rdf$name <- markUtf8(rdf$name)

  expect_equal(rdf$name[1], expected[[1]])
  expect_equal(rdf$name[2], expected[[2]])
  expect_equal(rdf$name[3], expected[[3]])
  expect_equal(rdf$name[4], expected[[4]])

  df1 <- createDataFrame(rdf)
  expect_equal(
    collect(
      where(df1, df1$name == expected[[2]])
    )$name,
    expected[[2]]
  )
})
```
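For reference, the lookaround pattern used both in the note above and in the final patch pulls the `name` value out without consuming the surrounding quotes; a small standalone illustration:
```
lines <- c('{"name": "foo"}', '{"name": "bar", "age": 1}')
# (?<="name": ")  fixed-width lookbehind anchored at the opening quote
# .*?             lazily consumes the value
# (?=")           lookahead stops at the closing quote
regmatches(lines, gregexpr('(?<="name": ").*?(?=")', lines, perl = TRUE))
# [[1]]
# [1] "foo"
#
# [[2]]
# [1] "bar"
```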
Closes #27362 from zero323/SPARK-30645.
Authored-by: zero323 <ms...@gmail.com>
Signed-off-by: HyukjinKwon <gu...@apache.org>
(cherry picked from commit 40b1f4d87e0f24e4e7e2fd6fe37cc5398ae778f8)
Signed-off-by: HyukjinKwon <gu...@apache.org>
---
 R/pkg/tests/fulltests/data/test_utils_utf.json |  4 ++++
 R/pkg/tests/fulltests/test_sparkSQL.R          | 29 ++++++++++++++++----------
 2 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/R/pkg/tests/fulltests/data/test_utils_utf.json b/R/pkg/tests/fulltests/data/test_utils_utf.json
new file mode 100644
index 0000000..b78352e
--- /dev/null
+++ b/R/pkg/tests/fulltests/data/test_utils_utf.json
@@ -0,0 +1,4 @@
+{"name": "안녕하세요"}
+{"name": "您好", "age": 30}
+{"name": "こんにちは", "age": 19}
+{"name": "Xin chào"}
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index 3f7eee3..cd262a4 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -875,24 +875,31 @@ test_that("collect() and take() on a DataFrame return the same number of rows an
 })
 
 test_that("collect() support Unicode characters", {
-  lines <- c("{\"name\":\"안녕하세요\"}",
-             "{\"name\":\"您好\", \"age\":30}",
-             "{\"name\":\"こんにちは\", \"age\":19}",
-             "{\"name\":\"Xin chào\"}")
+  jsonPath <- file.path(
+    Sys.getenv("SPARK_HOME"),
+    "R", "pkg", "tests", "fulltests", "data",
+    "test_utils_utf.json"
+  )
+
+  lines <- readLines(jsonPath, encoding = "UTF-8")
 
-  jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-  writeLines(lines, jsonPath)
+  expected <- regmatches(lines, gregexpr('(?<="name": ").*?(?=")', lines, perl = TRUE))
 
   df <- read.df(jsonPath, "json")
   rdf <- collect(df)
   expect_true(is.data.frame(rdf))
-  expect_equal(rdf$name[1], markUtf8("안녕하세요"))
-  expect_equal(rdf$name[2], markUtf8("您好"))
-  expect_equal(rdf$name[3], markUtf8("こんにちは"))
-  expect_equal(rdf$name[4], markUtf8("Xin chào"))
+  expect_equal(rdf$name[1], expected[[1]])
+  expect_equal(rdf$name[2], expected[[2]])
+  expect_equal(rdf$name[3], expected[[3]])
+  expect_equal(rdf$name[4], expected[[4]])
 
   df1 <- createDataFrame(rdf)
-  expect_equal(collect(where(df1, df1$name == markUtf8("您好")))$name, markUtf8("您好"))
+  expect_equal(
+    collect(
+      where(df1, df1$name == expected[[2]])
+    )$name,
+    expected[[2]]
+  )
 })
 
 test_that("multiple pipeline transformations result in an RDD with the correct values", {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org