You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@arrow.apache.org by "Andy Teucher (Jira)" <ji...@apache.org> on 2022/03/22 19:08:00 UTC

[jira] [Updated] (ARROW-16007) [R] binding for grepl has different behaviour with NA compared to R base grepl

     [ https://issues.apache.org/jira/browse/ARROW-16007?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Andy Teucher updated ARROW-16007:
---------------------------------
    Description: 
The arrow binding to {{grepl}} behaves slightly differently than the base R {{grepl}}, in that it returns {{NA}} for {{NA}} inputs, whereas base {{grepl}} returns {{FALSE}} with {{NA}} inputs. arrow's implementation is consistent with {{stringr::str_detect()}}, and both {{str_detect()}} and {{grepl()}} are bound to {{match_substring_regex}} and {{match_substring}} in arrow.

I don't know if this is something you would want to change so that the {{grepl}} behaviour aligns with base {{{}grepl{}}}, or simply document this difference?

Reprex:
 
{code:r}
library(arrow, warn.conflicts = FALSE, quietly = TRUE)
library(dplyr, warn.conflicts = FALSE, quietly = TRUE)
library(stringr, quietly = TRUE)

alpha_df <- data.frame(alpha = c("alpha", "bet", NA_character_))
alpha_dataset <- InMemoryDataset$create(alpha_df)

mutate(alpha_df, 
       grepl_is_a = grepl("a", alpha), 
       stringr_is_a = str_detect(alpha, "a"))
#>   alpha grepl_is_a stringr_is_a
#> 1 alpha       TRUE         TRUE
#> 2   bet      FALSE        FALSE
#> 3  <NA>      FALSE           NA

mutate(alpha_dataset, 
       grepl_is_a = grepl("a", alpha), 
       stringr_is_a = str_detect(alpha, "a")) |> 
  collect()
#>   alpha grepl_is_a stringr_is_a
#> 1 alpha       TRUE         TRUE
#> 2   bet      FALSE        FALSE
#> 3  <NA>         NA           NA

# base R grepl returns FALSE for NA
grepl("a", alpha_df$alpha) # bound to arrow_match_substring_regex
#> [1]  TRUE FALSE FALSE

grepl("a", alpha_df$alpha, fixed = TRUE) # bound to arrow_match_substring
#> [1]  TRUE FALSE FALSE

# stringr::str_detect returns NA for NA
str_detect(alpha_df$alpha, "a")
#> [1]  TRUE FALSE    NA

alpha_array <- Array$create(alpha_df$alpha)

# arrow functions return null for null (NA)
call_function("match_substring_regex", alpha_array, options = list(pattern = "a"))
#> Array
#> <bool>
#> [
#>   true,
#>   false,
#>   null
#> ]

call_function("match_substring", alpha_array, options = list(pattern = "a"))
#> Array
#> <bool>
#> [
#>   true,
#>   false,
#>   null
#> ]
{code}
 

 

  was:
The arrow binding to {{grepl}} behaves slightly differently than the base R {{{}grepl{}}}, in that it returns {{NA}} for {{NA}} inputs, whereas base {{grepl}} returns {{FALSE }}with{{ NA }}inputs. arrow's implementention is consistent with {{{}stringr::str_detect(){}}}, and both {{str_detect()}} and {{grepl()}} are bound to {{match_substring_regex}} and {{match_substring}} in arrow.

I don't know if this is something you would want to change so that the {{grepl}} behaviour aligns with base {{{}grepl{}}}, or simply document this difference?

Reprex:
 
{code:r}
library(arrow, warn.conflicts = FALSE, quietly = TRUE)
library(dplyr, warn.conflicts = FALSE, quietly = TRUE)
library(stringr, quietly = TRUE)

alpha_df <- data.frame(alpha = c("alpha", "bet", NA_character_))
alpha_dataset <- InMemoryDataset$create(alpha_df)

mutate(alpha_df, 
       grepl_is_a = grepl("a", alpha), 
       stringr_is_a = str_detect(alpha, "a"))
#>   alpha grepl_is_a stringr_is_a
#> 1 alpha       TRUE         TRUE
#> 2   bet      FALSE        FALSE
#> 3  <NA>      FALSE           NA

mutate(alpha_dataset, 
       grepl_is_a = grepl("a", alpha), 
       stringr_is_a = str_detect(alpha, "a")) |> 
  collect()
#>   alpha grepl_is_a stringr_is_a
#> 1 alpha       TRUE         TRUE
#> 2   bet      FALSE        FALSE
#> 3  <NA>         NA           NA

# base R grepl returns FALSE for NA
grepl("a", alpha_df$alpha) # bound to arrow_match_substring_regex
#> [1]  TRUE FALSE FALSE

grepl("a", alpha_df$alpha, fixed = TRUE) # bound to arrow_match_substring
#> [1]  TRUE FALSE FALSE

# stringr::str_dectect returns NA for NA
str_detect(alpha_df$alpha, "a")
#> [1]  TRUE FALSE    NA

alpha_array <- Array$create(alpha_df$alpha)

# arrow functions return null for null (NA)
call_function("match_substring_regex", alpha_array, options = list(pattern = "a"))
#> Array
#> <bool>
#> [
#>   true,
#>   false,
#>   null
#> ]

call_function("match_substring", alpha_array, options = list(pattern = "a"))
#> Array
#> <bool>
#> [
#>   true,
#>   false,
#>   null
#> ]
{code}
 

 


> [R] binding for grepl has different behaviour with NA compared to R base grepl
> ------------------------------------------------------------------------------
>
>                 Key: ARROW-16007
>                 URL: https://issues.apache.org/jira/browse/ARROW-16007
>             Project: Apache Arrow
>          Issue Type: Improvement
>    Affects Versions: 7.0.0
>            Reporter: Andy Teucher
>            Priority: Minor
>
> The arrow binding to {{grepl}} behaves slightly differently than the base R {{grepl}}, in that it returns {{NA}} for {{NA}} inputs, whereas base {{grepl}} returns {{FALSE}} with {{NA}} inputs. arrow's implementation is consistent with {{stringr::str_detect()}}, and both {{str_detect()}} and {{grepl()}} are bound to {{match_substring_regex}} and {{match_substring}} in arrow.
> I don't know if this is something you would want to change so that the {{grepl}} behaviour aligns with base {{{}grepl{}}}, or simply document this difference?
> Reprex:
>  
> {code:r}
> library(arrow, warn.conflicts = FALSE, quietly = TRUE)
> library(dplyr, warn.conflicts = FALSE, quietly = TRUE)
> library(stringr, quietly = TRUE)
> alpha_df <- data.frame(alpha = c("alpha", "bet", NA_character_))
> alpha_dataset <- InMemoryDataset$create(alpha_df)
> mutate(alpha_df, 
>        grepl_is_a = grepl("a", alpha), 
>        stringr_is_a = str_detect(alpha, "a"))
> #>   alpha grepl_is_a stringr_is_a
> #> 1 alpha       TRUE         TRUE
> #> 2   bet      FALSE        FALSE
> #> 3  <NA>      FALSE           NA
> mutate(alpha_dataset, 
>        grepl_is_a = grepl("a", alpha), 
>        stringr_is_a = str_detect(alpha, "a")) |> 
>   collect()
> #>   alpha grepl_is_a stringr_is_a
> #> 1 alpha       TRUE         TRUE
> #> 2   bet      FALSE        FALSE
> #> 3  <NA>         NA           NA
> # base R grepl returns FALSE for NA
> grepl("a", alpha_df$alpha) # bound to arrow_match_substring_regex
> #> [1]  TRUE FALSE FALSE
> grepl("a", alpha_df$alpha, fixed = TRUE) # bound to arrow_match_substring
> #> [1]  TRUE FALSE FALSE
> # stringr::str_detect returns NA for NA
> str_detect(alpha_df$alpha, "a")
> #> [1]  TRUE FALSE    NA
> alpha_array <- Array$create(alpha_df$alpha)
> # arrow functions return null for null (NA)
> call_function("match_substring_regex", alpha_array, options = list(pattern = "a"))
> #> Array
> #> <bool>
> #> [
> #>   true,
> #>   false,
> #>   null
> #> ]
> call_function("match_substring", alpha_array, options = list(pattern = "a"))
> #> Array
> #> <bool>
> #> [
> #>   true,
> #>   false,
> #>   null
> #> ]
> {code}
>  
>  



--
This message was sent by Atlassian Jira
(v8.20.1#820001)