You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@arrow.apache.org by "Dewey Dunnington (Jira)" <ji...@apache.org> on 2021/12/17 15:22:00 UTC

[jira] [Commented] (ARROW-14744) [R] open_dataset() error when `schema` argument supplied, but `column_names` not supplied to `CSVReadOptions`

    [ https://issues.apache.org/jira/browse/ARROW-14744?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17461512#comment-17461512 ] 

Dewey Dunnington commented on ARROW-14744:
------------------------------------------

Full reprex:

{code:R}
library(arrow, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)

diamond_schema <- Table$create(
  ggplot2::diamonds %>% mutate_if(is.factor, as.character)
)$schema

td <- tempfile()
dir.create(td)

readr::write_csv(
  ggplot2::diamonds, file=file.path(td, 'diamonds.csv'),
  col_names = FALSE
)

open_dataset(
  td,
  format='csv',
  schema = diamond_schema,
  skip_rows = 1,
  read_options=arrow::CsvReadOptions$create(
    skip_rows = 1
    # ...and no column_names supplied
  )
) %>%
  collect()
#> # A tibble: 53,938 × 10
#>    carat cut   color clarity depth table price     x     y     z
#>    <dbl> <chr> <chr> <chr>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
#>  1    NA <NA>  <NA>  <NA>       NA    NA    NA    NA    NA    NA
#>  2    NA <NA>  <NA>  <NA>       NA    NA    NA    NA    NA    NA
#>  3    NA <NA>  <NA>  <NA>       NA    NA    NA    NA    NA    NA
#>  4    NA <NA>  <NA>  <NA>       NA    NA    NA    NA    NA    NA
#>  5    NA <NA>  <NA>  <NA>       NA    NA    NA    NA    NA    NA
#>  6    NA <NA>  <NA>  <NA>       NA    NA    NA    NA    NA    NA
#>  7    NA <NA>  <NA>  <NA>       NA    NA    NA    NA    NA    NA
#>  8    NA <NA>  <NA>  <NA>       NA    NA    NA    NA    NA    NA
#>  9    NA <NA>  <NA>  <NA>       NA    NA    NA    NA    NA    NA
#> 10    NA <NA>  <NA>  <NA>       NA    NA    NA    NA    NA    NA
#> # … with 53,928 more rows

# ...but it works when column_names is supplied
open_dataset(
  td,
  format = 'csv',
  schema = diamond_schema,
  partitioning = NULL,
  skip_rows = 1,
  unify_schemas = FALSE,
  read_options = arrow::CsvReadOptions$create(
    skip_rows = 1,
    column_names = names(diamond_schema)
  )
) %>%
  collect()
#> # A tibble: 53,939 × 10
#>    carat cut       color clarity depth table price     x     y     z
#>    <dbl> <chr>     <chr> <chr>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
#>  1  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
#>  2  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
#>  3  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
#>  4  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
#>  5  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
#>  6  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
#>  7  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
#>  8  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
#>  9  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
#> 10  0.3  Good      J     SI1      64      55   339  4.25  4.28  2.73
#> # … with 53,929 more rows
{code}


> [R] open_dataset() error when `schema` argument supplied, but `column_names` not supplied to `CSVReadOptions`
> -------------------------------------------------------------------------------------------------------------
>
>                 Key: ARROW-14744
>                 URL: https://issues.apache.org/jira/browse/ARROW-14744
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: R
>            Reporter: Nicola Crane
>            Priority: Major
>             Fix For: 7.0.0
>
>
> Note: this occurs regardless of whether the data has a header or not
> {code:r}
> td <- tempfile()
> dir.create(td)
> readr::write_csv(ggplot2::diamonds, file=file.path(td, 'diamonds.csv'), col_names=FALSE)
> readLines(file.path(td, "diamonds.csv"), n = 2)
> open_dataset(
>   td,
>   format = 'csv',
>   schema = diamond_schema,
>   partitioning = NULL,
>   skip_rows = 1,
>   unify_schemas = FALSE,
>   read_options = arrow::CsvReadOptions$create(
>     skip_rows = 1,
>     column_names = names(diamond_schema)
>   )
> ) %>%
>   collect()
> # # A tibble: 53,939 × 10
> #    carat cut       color clarity depth table price     x     y     z
> #    <dbl> <chr>     <chr> <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
> #  1  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
> #  2  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
> #  3  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
> #  4  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
> #  5  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
> #  6  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
> #  7  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
> #  8  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
> #  9  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
> # 10  0.3  Good      J     SI1      64      55   339  4.25  4.28  2.73
> # # … with 53,929 more rows
> open_dataset(
>   td,
>   format='csv',
>   schema = diamond_schema,
>   skip_rows = 1,
>   read_options=arrow::CsvReadOptions$create(skip_rows=1)) %>%
>   collect()
> # # A tibble: 53,938 × 10
> #    carat cut   color clarity depth table price     x     y     z
> #    <dbl> <chr> <chr> <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
> #  1    NA NA    NA    NA         NA    NA    NA    NA    NA    NA
> #  2    NA NA    NA    NA         NA    NA    NA    NA    NA    NA
> #  3    NA NA    NA    NA         NA    NA    NA    NA    NA    NA
> #  4    NA NA    NA    NA         NA    NA    NA    NA    NA    NA
> #  5    NA NA    NA    NA         NA    NA    NA    NA    NA    NA
> #  6    NA NA    NA    NA         NA    NA    NA    NA    NA    NA
> #  7    NA NA    NA    NA         NA    NA    NA    NA    NA    NA
> #  8    NA NA    NA    NA         NA    NA    NA    NA    NA    NA
> #  9    NA NA    NA    NA         NA    NA    NA    NA    NA    NA
> # 10    NA NA    NA    NA         NA    NA    NA    NA    NA    NA
> # # … with 53,928 more rows
> {code}



--
This message was sent by Atlassian Jira
(v8.20.1#820001)