You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by th...@apache.org on 2022/05/10 13:58:14 UTC
[arrow] branch master updated: ARROW-16489: [R] wrong encoding causes parsing error
This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3df3395edd ARROW-16489: [R] wrong encoding causes parsing error
3df3395edd is described below
commit 3df3395edd9526689829dc96b8e6151781ae6303
Author: Jacob Wujciak-Jens <ja...@wujciak.de>
AuthorDate: Tue May 10 14:58:04 2022 +0100
ARROW-16489: [R] wrong encoding causes parsing error
Closes #13082 from assignUser/ARROW-16489-fix-encoding
Authored-by: Jacob Wujciak-Jens <ja...@wujciak.de>
Signed-off-by: Nic Crane <th...@gmail.com>
---
r/.lintr | 1 -
r/.styler_excludes.R | 2 +-
r/tests/testthat/latin1.R | 76 -------------------------------------
r/tests/testthat/test-utf.R | 61 +++++++++++++++++++++++++++--
r/vignettes/developers/workflow.Rmd | 4 +-
5 files changed, 60 insertions(+), 84 deletions(-)
diff --git a/r/.lintr b/r/.lintr
index b7c046f8e2..0298fd7f99 100644
--- a/r/.lintr
+++ b/r/.lintr
@@ -26,7 +26,6 @@ linters: with_defaults(
open_curly_linter = NULL # styler and lintr conflict on this (https://github.com/r-lib/styler/issues/549#issuecomment-537191536)
)
exclusions: list(
- "tests/testthat/latin1.R",
"R/arrowExports.R",
"data-raw/codegen.R"
)
diff --git a/r/.styler_excludes.R b/r/.styler_excludes.R
index 19cd1ffa55..392147f243 100644
--- a/r/.styler_excludes.R
+++ b/r/.styler_excludes.R
@@ -15,4 +15,4 @@
# specific language governing permissions and limitations
# under the License.
-c("tests/testthat/latin1.R", "data-raw/codegen.R")
\ No newline at end of file
+"data-raw/codegen.R"
diff --git a/r/tests/testthat/latin1.R b/r/tests/testthat/latin1.R
deleted file mode 100644
index 150192d314..0000000000
--- a/r/tests/testthat/latin1.R
+++ /dev/null
@@ -1,76 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-x <- iconv("Veitingastaðir", to = "latin1")
-df <- tibble::tibble(
- chr = x,
- fct = as.factor(x)
-)
-names(df) <- iconv(paste(x, names(df), sep = "_"), to = "latin1")
-df_struct <- tibble::tibble(a = df)
-
-raw_schema <- list(utf8(), dictionary(int8(), utf8()))
-names(raw_schema) <- names(df)
-
-# Confirm setup
-expect_identical(Encoding(x), "latin1")
-expect_identical(Encoding(names(df)), c("latin1", "latin1"))
-expect_identical(Encoding(df[[1]]), "latin1")
-expect_identical(Encoding(levels(df[[2]])), "latin1")
-
-# Array
-expect_identical(as.vector(Array$create(x)), x)
-# struct
-expect_identical(as.vector(Array$create(df)), df)
-
-# ChunkedArray
-expect_identical(as.vector(ChunkedArray$create(x)), x)
-# struct
-expect_identical(as.vector(ChunkedArray$create(df)), df)
-
-# Table (including field name)
-expect_identical(as.data.frame(Table$create(df)), df)
-expect_identical(as.data.frame(Table$create(df_struct)), df_struct)
-
-# RecordBatch
-expect_identical(as.data.frame(record_batch(df)), df)
-expect_identical(as.data.frame(record_batch(df_struct)), df_struct)
-
-# Schema field name
-df_schema <- do.call(schema, raw_schema)
-expect_identical(names(df_schema), names(df))
-
-df_struct_schema <- schema(a = do.call(struct, raw_schema))
-# StructType doesn't expose names (in C++)
-# expect_identical(names(df_struct_schema$a), names(df))
-
-# Create table/batch with schema
-expect_identical(as.data.frame(Table$create(df, schema = df_schema)), df)
-expect_identical(as.data.frame(Table$create(df_struct, schema = df_struct_schema)), df_struct)
-expect_identical(as.data.frame(record_batch(df, schema = df_schema)), df)
-expect_identical(as.data.frame(record_batch(df_struct, schema = df_struct_schema)), df_struct)
-
-# Serialization
-feather_file <- tempfile()
-write_feather(df_struct, feather_file)
-expect_identical(read_feather(feather_file), df_struct)
-
-if (arrow_with_parquet()) {
- parquet_file <- tempfile()
- write_parquet(df, parquet_file) # Parquet doesn't yet support nested types
- expect_identical(read_parquet(parquet_file), df)
-}
diff --git a/r/tests/testthat/test-utf.R b/r/tests/testthat/test-utf.R
index 69d196274a..f7553da5b4 100644
--- a/r/tests/testthat/test-utf.R
+++ b/r/tests/testthat/test-utf.R
@@ -17,8 +17,61 @@
test_that("We handle non-UTF strings", {
- # Move the code with non-UTF strings to a separate file so that we don't
- # get a parse error on *cough* certain platforms
- skip_on_cran()
- source("latin1.R", encoding = "latin1")
+ x <- iconv("Veitingastaðir", to = "latin1")
+ df <- tibble::tibble(
+ chr = x,
+ fct = as.factor(x)
+ )
+ names(df) <- iconv(paste(x, names(df), sep = "_"), to = "latin1")
+ df_struct <- tibble::tibble(a = df)
+
+ raw_schema <- list(utf8(), dictionary(int8(), utf8()))
+ names(raw_schema) <- names(df)
+
+ # Confirm setup
+ expect_identical(Encoding(x), "latin1")
+ expect_identical(Encoding(names(df)), c("latin1", "latin1"))
+ expect_identical(Encoding(df[[1]]), "latin1")
+ expect_identical(Encoding(levels(df[[2]])), "latin1")
+
+ # Array
+ expect_identical(as.vector(Array$create(x)), x)
+ # struct
+ expect_identical(as.vector(Array$create(df)), df)
+
+ # ChunkedArray
+ expect_identical(as.vector(ChunkedArray$create(x)), x)
+ # struct
+ expect_identical(as.vector(ChunkedArray$create(df)), df)
+
+ # Table (including field name)
+ expect_identical(as.data.frame(Table$create(df)), df)
+ expect_identical(as.data.frame(Table$create(df_struct)), df_struct)
+
+ # RecordBatch
+ expect_identical(as.data.frame(record_batch(df)), df)
+ expect_identical(as.data.frame(record_batch(df_struct)), df_struct)
+
+ # Schema field name
+ df_schema <- do.call(schema, raw_schema)
+ expect_identical(names(df_schema), names(df))
+
+ df_struct_schema <- schema(a = do.call(struct, raw_schema))
+
+ # Create table/batch with schema
+ expect_identical(as.data.frame(Table$create(df, schema = df_schema)), df)
+ expect_identical(as.data.frame(Table$create(df_struct, schema = df_struct_schema)), df_struct)
+ expect_identical(as.data.frame(record_batch(df, schema = df_schema)), df)
+ expect_identical(as.data.frame(record_batch(df_struct, schema = df_struct_schema)), df_struct)
+
+ # Serialization
+ feather_file <- tempfile()
+ write_feather(df_struct, feather_file)
+ expect_identical(read_feather(feather_file), df_struct)
+
+ if (arrow_with_parquet()) {
+ parquet_file <- tempfile()
+ write_parquet(df, parquet_file) # Parquet doesn't yet support nested types
+ expect_identical(read_parquet(parquet_file), df)
+ }
})
diff --git a/r/vignettes/developers/workflow.Rmd b/r/vignettes/developers/workflow.Rmd
index b98e82e7b4..b7e0a27d76 100644
--- a/r/vignettes/developers/workflow.Rmd
+++ b/r/vignettes/developers/workflow.Rmd
@@ -71,8 +71,8 @@ make style-all # (for all files)
or in R:
```r
-# note the two excluded files which should not be styled
-styler::style_pkg(exclude_files = c("tests/testthat/latin1.R", "data-raw/codegen.R"))
+# note the file that should not be styled
+styler::style_pkg(exclude_files = c("data-raw/codegen.R"))
```
The styler package will fix many styling errors, though not all lintr errors are automatically fixable with styler. The list of files we intentionally do not style is in `r/.styler_excludes.R`.