You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by th...@apache.org on 2022/05/10 13:58:14 UTC
[arrow] branch master updated: ARROW-16489: [R] wrong encoding causes parsing error

This is an automated email from the ASF dual-hosted git repository.

thisisnic pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 3df3395edd ARROW-16489: [R] wrong encoding causes parsing error
3df3395edd is described below

commit 3df3395edd9526689829dc96b8e6151781ae6303
Author: Jacob Wujciak-Jens <ja...@wujciak.de>
AuthorDate: Tue May 10 14:58:04 2022 +0100

    ARROW-16489: [R] wrong encoding causes parsing error
    
    Closes #13082 from assignUser/ARROW-16489-fix-encoding
    
    Authored-by: Jacob Wujciak-Jens <ja...@wujciak.de>
    Signed-off-by: Nic Crane <th...@gmail.com>
---
 r/.lintr                            |  1 -
 r/.styler_excludes.R                |  2 +-
 r/tests/testthat/latin1.R           | 76 -------------------------------------
 r/tests/testthat/test-utf.R         | 61 +++++++++++++++++++++++++++--
 r/vignettes/developers/workflow.Rmd |  4 +-
 5 files changed, 60 insertions(+), 84 deletions(-)

diff --git a/r/.lintr b/r/.lintr
index b7c046f8e2..0298fd7f99 100644
--- a/r/.lintr
+++ b/r/.lintr
@@ -26,7 +26,6 @@ linters: with_defaults(
   open_curly_linter = NULL # styler and lintr conflict on this (https://github.com/r-lib/styler/issues/549#issuecomment-537191536)
   )
 exclusions: list(
-  "tests/testthat/latin1.R",
   "R/arrowExports.R",
   "data-raw/codegen.R"
   )
diff --git a/r/.styler_excludes.R b/r/.styler_excludes.R
index 19cd1ffa55..392147f243 100644
--- a/r/.styler_excludes.R
+++ b/r/.styler_excludes.R
@@ -15,4 +15,4 @@
 #  specific language governing permissions and limitations
 #  under the License.
 
-c("tests/testthat/latin1.R", "data-raw/codegen.R")
\ No newline at end of file
+"data-raw/codegen.R"
diff --git a/r/tests/testthat/latin1.R b/r/tests/testthat/latin1.R
deleted file mode 100644
index 150192d314..0000000000
--- a/r/tests/testthat/latin1.R
+++ /dev/null
@@ -1,76 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-x <- iconv("Veitingastaðir", to = "latin1")
-df <- tibble::tibble(
-  chr = x,
-  fct = as.factor(x)
-)
-names(df) <- iconv(paste(x, names(df), sep = "_"), to = "latin1")
-df_struct <- tibble::tibble(a = df)
-
-raw_schema <- list(utf8(), dictionary(int8(), utf8()))
-names(raw_schema) <- names(df)
-
-# Confirm setup
-expect_identical(Encoding(x), "latin1")
-expect_identical(Encoding(names(df)), c("latin1", "latin1"))
-expect_identical(Encoding(df[[1]]), "latin1")
-expect_identical(Encoding(levels(df[[2]])), "latin1")
-
-# Array
-expect_identical(as.vector(Array$create(x)), x)
-# struct
-expect_identical(as.vector(Array$create(df)), df)
-
-# ChunkedArray
-expect_identical(as.vector(ChunkedArray$create(x)), x)
-# struct
-expect_identical(as.vector(ChunkedArray$create(df)), df)
-
-# Table (including field name)
-expect_identical(as.data.frame(Table$create(df)), df)
-expect_identical(as.data.frame(Table$create(df_struct)), df_struct)
-
-# RecordBatch
-expect_identical(as.data.frame(record_batch(df)), df)
-expect_identical(as.data.frame(record_batch(df_struct)), df_struct)
-
-# Schema field name
-df_schema <- do.call(schema, raw_schema)
-expect_identical(names(df_schema), names(df))
-
-df_struct_schema <- schema(a = do.call(struct, raw_schema))
-# StructType doesn't expose names (in C++)
-# expect_identical(names(df_struct_schema$a), names(df))
-
-# Create table/batch with schema
-expect_identical(as.data.frame(Table$create(df, schema = df_schema)), df)
-expect_identical(as.data.frame(Table$create(df_struct, schema = df_struct_schema)), df_struct)
-expect_identical(as.data.frame(record_batch(df, schema = df_schema)), df)
-expect_identical(as.data.frame(record_batch(df_struct, schema = df_struct_schema)), df_struct)
-
-# Serialization
-feather_file <- tempfile()
-write_feather(df_struct, feather_file)
-expect_identical(read_feather(feather_file), df_struct)
-
-if (arrow_with_parquet()) {
-	parquet_file <- tempfile()
-	write_parquet(df, parquet_file) # Parquet doesn't yet support nested types
-	expect_identical(read_parquet(parquet_file), df)
-}
diff --git a/r/tests/testthat/test-utf.R b/r/tests/testthat/test-utf.R
index 69d196274a..f7553da5b4 100644
--- a/r/tests/testthat/test-utf.R
+++ b/r/tests/testthat/test-utf.R
@@ -17,8 +17,61 @@
 
 
 test_that("We handle non-UTF strings", {
-  # Move the code with non-UTF strings to a separate file so that we don't
-  # get a parse error on *cough* certain platforms
-  skip_on_cran()
-  source("latin1.R", encoding = "latin1")
+  x <- iconv("Veitingastaðir", to = "latin1")
+  df <- tibble::tibble(
+    chr = x,
+    fct = as.factor(x)
+  )
+  names(df) <- iconv(paste(x, names(df), sep = "_"), to = "latin1")
+  df_struct <- tibble::tibble(a = df)
+
+  raw_schema <- list(utf8(), dictionary(int8(), utf8()))
+  names(raw_schema) <- names(df)
+
+  # Confirm setup
+  expect_identical(Encoding(x), "latin1")
+  expect_identical(Encoding(names(df)), c("latin1", "latin1"))
+  expect_identical(Encoding(df[[1]]), "latin1")
+  expect_identical(Encoding(levels(df[[2]])), "latin1")
+
+  # Array
+  expect_identical(as.vector(Array$create(x)), x)
+  # struct
+  expect_identical(as.vector(Array$create(df)), df)
+
+  # ChunkedArray
+  expect_identical(as.vector(ChunkedArray$create(x)), x)
+  # struct
+  expect_identical(as.vector(ChunkedArray$create(df)), df)
+
+  # Table (including field name)
+  expect_identical(as.data.frame(Table$create(df)), df)
+  expect_identical(as.data.frame(Table$create(df_struct)), df_struct)
+
+  # RecordBatch
+  expect_identical(as.data.frame(record_batch(df)), df)
+  expect_identical(as.data.frame(record_batch(df_struct)), df_struct)
+
+  # Schema field name
+  df_schema <- do.call(schema, raw_schema)
+  expect_identical(names(df_schema), names(df))
+
+  df_struct_schema <- schema(a = do.call(struct, raw_schema))
+
+  # Create table/batch with schema
+  expect_identical(as.data.frame(Table$create(df, schema = df_schema)), df)
+  expect_identical(as.data.frame(Table$create(df_struct, schema = df_struct_schema)), df_struct)
+  expect_identical(as.data.frame(record_batch(df, schema = df_schema)), df)
+  expect_identical(as.data.frame(record_batch(df_struct, schema = df_struct_schema)), df_struct)
+
+  # Serialization
+  feather_file <- tempfile()
+  write_feather(df_struct, feather_file)
+  expect_identical(read_feather(feather_file), df_struct)
+
+  if (arrow_with_parquet()) {
+    parquet_file <- tempfile()
+    write_parquet(df, parquet_file) # Parquet doesn't yet support nested types
+    expect_identical(read_parquet(parquet_file), df)
+  }
 })
diff --git a/r/vignettes/developers/workflow.Rmd b/r/vignettes/developers/workflow.Rmd
index b98e82e7b4..b7e0a27d76 100644
--- a/r/vignettes/developers/workflow.Rmd
+++ b/r/vignettes/developers/workflow.Rmd
@@ -71,8 +71,8 @@ make style-all # (for all files)
 or in R:
 
 ```r
-# note the two excluded files which should not be styled
-styler::style_pkg(exclude_files = c("tests/testthat/latin1.R", "data-raw/codegen.R"))
+# note the file that should not be styled
+styler::style_pkg(exclude_files = c("data-raw/codegen.R"))
 ```
 
 The styler package will fix many styling errors, though not all lintr errors are automatically fixable with styler. The list of files we intentionally do not style is in `r/.styler_excludes.R`.