You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@arrow.apache.org by "Lorenzo Isella (Jira)" <ji...@apache.org> on 2022/10/31 15:11:00 UTC

[jira] [Updated] (ARROW-18202) Gsub does not work properly

     [ https://issues.apache.org/jira/browse/ARROW-18202?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Lorenzo Isella updated ARROW-18202:
-----------------------------------
    Description: 
Hello,

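I think there is a problem with the arrow R package, version 10.0.0: when gsub("^$", "0", x) is used inside mutate() on an Arrow dataset, the empty strings are not replaced (see the example below). I did not have this issue with arrow 9.0.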

Could you please have a look?

Many thanks

 

 

#################################################################

 

 

# Reproducible example: build a small table whose x column contains empty
# strings, so that replacing "" with "0" can be compared between an Arrow
# dataset and a plain tibble.
library(tidyverse)
library(arrow)
#> 
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#> 
#>     timestamp

# Eight sample strings; the seventh one is the empty string "".
ll <- c(      "1000000",   "10000000",  "2000000"  , "30000000" , "500000"   ,
        "5000000", ""   ,   "Not Range")


# Repeat the eight strings 1000 times (8000 rows) and number the rows in y.
df <- tibble(x=rep(ll, 1000), y=seq(8000))

 


# Write the tibble to a tab-separated file in the working directory.
write_tsv(df, "data.tsv")

# Re-open the file as an Arrow dataset with an explicit schema (x as string,
# y as double). skip_rows=1 presumably skips the header line written by
# write_tsv(), since the column names come from the schema.
data <- open_dataset("data.tsv", format="tsv",
                     skip_rows=1,
                     schema=schema(x=string(),
                     y=double())
)

# Collect the dataset without any transformation; `test` is not used again
# in this example.
test <- data |>
    collect()

# Goal: replace the empty strings "" in x with "0".
# I believe this worked with arrow 9.0.

df2 <- data |>
    mutate(x=gsub("^$","0",x) ) |>
    collect()


df2 # Problem: the "" entries in x are not modified (see row 7 in the output)
#> # A tibble: 8,000 × 2
#>    x               y
#>    <chr>       <dbl>
#>  1 "1000000"       1
#>  2 "10000000"      2
#>  3 "2000000"       3
#>  4 "30000000"      4
#>  5 "500000"        5
#>  6 "5000000"       6
#>  7 ""              7
#>  8 "Not Range"     8
#>  9 "1000000"       9
#> 10 "10000000"     10
#> # … with 7,990 more rows

 


# Control case: the same gsub() call applied to the in-memory tibble `df`
# instead of the Arrow dataset.
df3 <- df |>
    mutate(x=gsub("^$","0",x) )

df3  # This is fine: the "" entries in x become "0" (see row 7 in the output)
#> # A tibble: 8,000 × 2
#>    x             y
#>    <chr>     <int>
#>  1 1000000       1
#>  2 10000000      2
#>  3 2000000       3
#>  4 30000000      4
#>  5 500000        5
#>  6 5000000       6
#>  7 0             7
#>  8 Not Range     8
#>  9 1000000       9
#> 10 10000000     10
#> # … with 7,990 more rows

# How can this be fixed? I believe this issue did not arise with arrow 9.0.

# Print the R version, platform, and package versions used for this report.
sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Debian GNU/Linux 11 (bullseye)
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
#>  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
#>  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] arrow_10.0.0    forcats_0.5.2   stringr_1.4.1   dplyr_1.0.10   
#>  [5] purrr_0.3.5     readr_2.1.3     tidyr_1.2.1     tibble_3.1.8   
#>  [9] ggplot2_3.3.6   tidyverse_1.3.2
#> 
#> loaded via a namespace (and not attached):
#>  [1] lubridate_1.8.0     assertthat_0.2.1    digest_0.6.30      
#>  [4] utf8_1.2.2          R6_2.5.1            cellranger_1.1.0   
#>  [7] backports_1.4.1     reprex_2.0.2        evaluate_0.17      
#> [10] httr_1.4.4          highr_0.9           pillar_1.8.1       
#> [13] rlang_1.0.6         googlesheets4_1.0.1 readxl_1.4.1       
#> [16] R.utils_2.12.1      R.oo_1.25.0         rmarkdown_2.17     
#> [19] styler_1.8.0        googledrive_2.0.0   bit_4.0.4          
#> [22] munsell_0.5.0       broom_1.0.1         compiler_4.2.1     
#> [25] modelr_0.1.9        xfun_0.34           pkgconfig_2.0.3    
#> [28] htmltools_0.5.3     tidyselect_1.2.0    fansi_1.0.3        
#> [31] crayon_1.5.2        tzdb_0.3.0          dbplyr_2.2.1       
#> [34] withr_2.5.0         R.methodsS3_1.8.2   grid_4.2.1         
#> [37] jsonlite_1.8.3      gtable_0.3.1        lifecycle_1.0.3    
#> [40] DBI_1.1.3           magrittr_2.0.3      scales_1.2.1       
#> [43] vroom_1.6.0         cli_3.4.1           stringi_1.7.8      
#> [46] fs_1.5.2            xml2_1.3.3          ellipsis_0.3.2     
#> [49] generics_0.1.3      vctrs_0.5.0         tools_4.2.1        
#> [52] bit64_4.0.5         R.cache_0.16.0      glue_1.6.2         
#> [55] hms_1.1.2           parallel_4.2.1      fastmap_1.1.0      
#> [58] yaml_2.3.6          colorspace_2.0-3    gargle_1.2.1       
#> [61] rvest_1.0.3         knitr_1.40          haven_2.5.1

> Gsub does not work properly
> ---------------------------
>
>                 Key: ARROW-18202
>                 URL: https://issues.apache.org/jira/browse/ARROW-18202
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: R
>            Reporter: Lorenzo Isella
>            Priority: Major
>
> Hello,
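> I think there is a problem with the arrow R package, version 10.0.0: when gsub("^$", "0", x) is used inside mutate() on an Arrow dataset, the empty strings are not replaced (see the example below). I did not have this issue with arrow 9.0.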
> Could you please have a look?
> Many thanks
>  
>  
> #################################################################
>  
>  
> library(tidyverse)
> library(arrow)
> #> 
> #> Attaching package: 'arrow'
> #> The following object is masked from 'package:utils':
> #> 
> #>     timestamp
> ll <- c(      "1000000",   "10000000",  "2000000"  , "30000000" , "500000"   ,
>         "5000000", ""   ,   "Not Range")
> df <- tibble(x=rep(ll, 1000), y=seq(8000))
>  
> write_tsv(df, "data.tsv")
> data <- open_dataset("data.tsv", format="tsv",
>                      skip_rows=1,
>                      schema=schema(x=string(),
>                      y=double())
> )
> test <- data |>
>     collect()
> ###I want to replace the "" with "0". I believe this worked with arrow 9.0
> df2 <- data |>
>     mutate(x=gsub("^$","0",x) ) |>
>     collect()
> df2 ### now I did not modify the  "" entries in x
> #> # A tibble: 8,000 × 2
> #>    x               y
> #>    <chr>       <dbl>
> #>  1 "1000000"       1
> #>  2 "10000000"      2
> #>  3 "2000000"       3
> #>  4 "30000000"      4
> #>  5 "500000"        5
> #>  6 "5000000"       6
> #>  7 ""              7
> #>  8 "Not Range"     8
> #>  9 "1000000"       9
> #> 10 "10000000"     10
> #> # … with 7,990 more rows
>  
> df3 <- df |>
>     mutate(x=gsub("^$","0",x) )
> df3  ## and this is fine
> #> # A tibble: 8,000 × 2
> #>    x             y
> #>    <chr>     <int>
> #>  1 1000000       1
> #>  2 10000000      2
> #>  3 2000000       3
> #>  4 30000000      4
> #>  5 500000        5
> #>  6 5000000       6
> #>  7 0             7
> #>  8 Not Range     8
> #>  9 1000000       9
> #> 10 10000000     10
> #> # … with 7,990 more rows
> ## How to fix this...I believe this issue did not arise with arrow 9.0.
> sessionInfo()
> #> R version 4.2.1 (2022-06-23)
> #> Platform: x86_64-pc-linux-gnu (64-bit)
> #> Running under: Debian GNU/Linux 11 (bullseye)
> #> 
> #> Matrix products: default
> #> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
> #> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
> #> 
> #> locale:
> #>  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
> #>  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
> #>  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
> #>  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
> #>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
> #> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
> #> 
> #> attached base packages:
> #> [1] stats     graphics  grDevices utils     datasets  methods   base     
> #> 
> #> other attached packages:
> #>  [1] arrow_10.0.0    forcats_0.5.2   stringr_1.4.1   dplyr_1.0.10   
> #>  [5] purrr_0.3.5     readr_2.1.3     tidyr_1.2.1     tibble_3.1.8   
> #>  [9] ggplot2_3.3.6   tidyverse_1.3.2
> #> 
> #> loaded via a namespace (and not attached):
> #>  [1] lubridate_1.8.0     assertthat_0.2.1    digest_0.6.30      
> #>  [4] utf8_1.2.2          R6_2.5.1            cellranger_1.1.0   
> #>  [7] backports_1.4.1     reprex_2.0.2        evaluate_0.17      
> #> [10] httr_1.4.4          highr_0.9           pillar_1.8.1       
> #> [13] rlang_1.0.6         googlesheets4_1.0.1 readxl_1.4.1       
> #> [16] R.utils_2.12.1      R.oo_1.25.0         rmarkdown_2.17     
> #> [19] styler_1.8.0        googledrive_2.0.0   bit_4.0.4          
> #> [22] munsell_0.5.0       broom_1.0.1         compiler_4.2.1     
> #> [25] modelr_0.1.9        xfun_0.34           pkgconfig_2.0.3    
> #> [28] htmltools_0.5.3     tidyselect_1.2.0    fansi_1.0.3        
> #> [31] crayon_1.5.2        tzdb_0.3.0          dbplyr_2.2.1       
> #> [34] withr_2.5.0         R.methodsS3_1.8.2   grid_4.2.1         
> #> [37] jsonlite_1.8.3      gtable_0.3.1        lifecycle_1.0.3    
> #> [40] DBI_1.1.3           magrittr_2.0.3      scales_1.2.1       
> #> [43] vroom_1.6.0         cli_3.4.1           stringi_1.7.8      
> #> [46] fs_1.5.2            xml2_1.3.3          ellipsis_0.3.2     
> #> [49] generics_0.1.3      vctrs_0.5.0         tools_4.2.1        
> #> [52] bit64_4.0.5         R.cache_0.16.0      glue_1.6.2         
> #> [55] hms_1.1.2           parallel_4.2.1      fastmap_1.1.0      
> #> [58] yaml_2.3.6          colorspace_2.0-3    gargle_1.2.1       
> #> [61] rvest_1.0.3         knitr_1.40          haven_2.5.1



--
This message was sent by Atlassian Jira
(v8.20.10#820010)