You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/11 14:21:40 UTC

[GitHub] [arrow] assignUser commented on a diff in pull request #12849: ARROW-15092: [R] Support create_package_with_all_dependencies() on non-linux systems

assignUser commented on code in PR #12849:
URL: https://github.com/apache/arrow/pull/12849#discussion_r847372973


##########
r/R/install-arrow.R:
##########
@@ -137,6 +137,262 @@ reload_arrow <- function() {
   }
 }
 
+# Substitute like the bash shell
+#
+# @param one_string A length-1 character vector
+# @param possible_values A dictionary-ish set of variables that could provide
+#   values to substitute in.
+# @return `one_string`, with values substituted like bash would.
+#
+# Only supports a small subset of bash substitution patterns. May have multiple
+# bash variables in `one_string`
+# Used as a helper to parse versions.txt
+..install_substitute_like_bash <- function(one_string, possible_values) {

Review Comment:
   I think it is not necessary to use the dot-prefix for internal functions as it is done nowhere else in the package. Nice job keeping a consistent naming scheme though! 👍 



##########
r/R/install-arrow.R:
##########
@@ -137,6 +137,262 @@ reload_arrow <- function() {
   }
 }
 
+# Substitute like the bash shell
+#
+# @param one_string A length-1 character vector
+# @param possible_values A dictionary-ish set of variables that could provide
+#   values to substitute in.
+# @return `one_string`, with values substituted like bash would.
+#
+# Only supports a small subset of bash substitution patterns. May have multiple
+# bash variables in `one_string`
+# Used as a helper to parse versions.txt
+..install_substitute_like_bash <- function(one_string, possible_values) {
+  stopifnot(
+    !is.null(names(possible_values)),
+    !any(names(possible_values) == ""),
+    anyDuplicated(names(possible_values)) == 0,
+    length(one_string) == 1,
+    !anyNA(possible_values)
+  )
+  # Find the name of the version we want, something like
+  # ARROW_RAPIDJSON_BUILD_VERSION
+  # The `(//./_|:1)` is a special case to handle some bash fanciness.
+  version_regex <- "\\$\\{(ARROW_[A-Z0-9_]+?_VERSION)(//./_|:1)?\\}"
+  # Extract the matched groups. If the pattern occurs multiple times, we need
+  # all (non-overlapping) matches. stringr::str_extract_all() or
+  # base::gregexec() would be useful here, but gregexec was only introduced in
+  # R 4.1.
+  matched_substrings <- regmatches(
+    one_string,
+    gregexpr(version_regex, one_string, perl = TRUE)
+  )[[1]] # Subset [[1]] because one_string has length 1
+  # `matched_substrings` is a character vector with length equal to the number
+  # of non-overlapping matches of `version_regex` in `one_string`. `match_list`
+  # is a list (same length as `matched_substrings`), where each list element is
+  # a length-3 character vector. The first element of the vector is the value
+  # from `matched_substrings` (e.g. "${ARROW_ZZZ_VERSION//./_}"). The following
+  #  two values are the captured groups specified in `version_regex` e.g.
+  # "ARROW_ZZZ_VERSION" and "//./_".
+  match_list <- regmatches(
+    matched_substrings,
+    regexec(version_regex, matched_substrings, perl = TRUE)
+  )
+  # Small helper to take slices of match_list
+  extract_chr_by_idx <- function(lst, idx) {
+    vapply(lst, function(x) x[idx], FUN.VALUE = character(1L))
+  }
+  string_to_sub <- extract_chr_by_idx(match_list, 1L)
+  version_varnames <- extract_chr_by_idx(match_list, 2L)
+  bash_special_cases <- extract_chr_by_idx(match_list, 3L)
+  version_values <- possible_values[version_varnames]
+  version_values <- ifelse(
+    bash_special_cases == "", version_values, ifelse(
+      bash_special_cases == ":1", substring(version_values, 2), ifelse(
+        bash_special_cases == "//./_", gsub(".", "_", version_values, fixed = TRUE),
+        NA_character_ # otherwise
+      )
+    )
+  )
+  num_to_sub <- length(string_to_sub)
+  stopifnot(
+    all(version_varnames %in% names(possible_values)),
+    !anyNA(version_values),
+    num_to_sub >= 1,
+    num_to_sub < 10 # Something has gone wrong if we're doing 10+
+  )
+  out <- one_string
+  for (idx in seq_len(num_to_sub)) {
+    # not gsub in case there are duplicates
+    out <- sub(string_to_sub[idx], version_values[idx], out, fixed = TRUE)
+  }
+  out
+}
+
+# Substitute all values in the filenames and URLs of versions.txt
+#
+# @param deps_unsubstituted A list with two elements, `filenames` and `urls`
+# @param possible_values A dictionary-ish set of variables that could provide
+#   values to substitute in.
+# @return A list with two elements, `filenames` and `urls`, with values
+#   substituted into the strings like bash would.
+#
+# Used as a helper to parse versions.txt
+..install_substitute_all <- function(deps_unsubstituted, possible_values) {
+  file_substituted <- vapply(
+    deps_unsubstituted$filenames,
+    ..install_substitute_like_bash,
+    FUN.VALUE = character(1),
+    possible_values = possible_values
+  )
+  url_substituted <- vapply(
+    deps_unsubstituted$urls,
+    ..install_substitute_like_bash,
+    FUN.VALUE = character(1),
+    possible_values = possible_values
+  )
+  list(
+    filenames = unname(file_substituted),
+    urls = unname(url_substituted)
+  )
+}
+
+# Parse the version lines portion of versions.txt
+#
+# @param version_lines A character vector of lines read from versions.txt
+# @return The parsed and bash-substiuted version values
+#
+# Used as a helper to parse versions.txt
+..install_parse_version_lines <- function(version_lines) {
+  version_lines <- trimws(version_lines)
+  version_regex <- "^(ARROW_[A-Z0-9_]+_)(VERSION|SHA256_CHECKSUM)=([^=]+)$"
+  if (!all(grepl(version_regex, version_lines, perl = TRUE))) {
+    stop("Failed to parse version lines")
+  }
+  match_list <- regmatches(
+    version_lines,
+    regexec(version_regex, version_lines, perl = TRUE)
+  )
+  # Find the lines where the second regex match group is that are "VERSION" (as
+  # opposed to "SHA256_CHECKSUM")
+  version_idx <- vapply(
+    match_list,
+    function(m) m[[3]] == "VERSION",
+    FUN.VALUE = logical(1)
+  )
+  version_matches <- match_list[version_idx]
+  # Fancy indexing here is just to pull the first and second regex match out,
+  # e.g. "ARROW_RAPIDJSON_BUILD_" and "VERSION"
+  version_varnames <- vapply(
+    version_matches,
+    function(m) paste0(m[[2]], m[[3]]),
+    FUN.VALUE = character(1)
+  )
+  version_values <- vapply(version_matches,
+    function(m) m[[4]],
+    FUN.VALUE = character(1)
+  )
+  names(version_values) <- version_varnames
+  return(version_values)
+}
+
+# Parse the URL + filename array portion of versions.txt
+#
+# @param array_lines Characer vector of lines from the versions.txt file with
+#   the filename and URL array
+# @return A list with two character vectors, with names `filenames` and `urls`
+#
+# The output of this function has split out the filename and URL components,
+# but has not yet substituted in the version numbers. The output is next passed
+# to `..install_substitute_all()`
+#
+# Used as a helper to parse versions.txt
+..install_parse_dependency_array <- function(array_lines) {
+  stopifnot(
+    length(array_lines) >= 1,
+    is.character(array_lines),
+    !anyNA(array_lines)
+  )
+  array_lines <- trimws(array_lines)
+
+  # Parse the array_lines with a regex. Each line of the array is a different
+  # component, with a format like `"<component name> <filename> <url>"` (quotes
+  # included). The filename and URL include some version string that's defined
+  # earlier in the file.
+  # Regex in words:
+  # Start with `"ARROW_`, then any capital ASCII letter, number, or underscore.
+  # After a space, find anything except a space, colon, or forward slash. (No
+  # space is essential, and would be essential to parsing the array in bash.
+  # The colon and slash are just basic guards that this is a filename.) Next, a
+  # space. Then a URL, starting with https://, and including anything except a
+  # space. (This is the URL before substituting in the version sting, so normal
+  # URL parsing rules don't apply.)
+  dep_array_regex <- '^"(ARROW_[A-Z0-9_]+_URL) ([^ :/"]+) (https://[^ "]+)"$'
+  if (!all(grepl(dep_array_regex, array_lines, perl = TRUE))) {
+    stop("Cannot parse thirdparty dependency array in expected format.")
+  }
+  list(
+    filenames = gsub(dep_array_regex, "\\2", array_lines, perl = TRUE),
+    urls      = gsub(dep_array_regex, "\\3", array_lines, perl = TRUE)
+  )
+}
+
+# Parse the versions.txt file
+#
+# @param versions_file Filename pointing to versions.txt
+# @return The parsed and ready-to-use values, as a named list of vectors
+#
+#
+# The versions.txt file is included as part of the R tar file, and is here:
+# https://github.com/apache/arrow/blob/master/cpp/thirdparty/versions.txt
+#
+# Used as a helper to parse versions.txt
+..install_parse_lines <- function(versions_file) {
+  orig_lines <- readLines(versions_file)
+
+  lines <- gsub("#.*", "", orig_lines, perl = TRUE)
+  lines <- lines[lines != ""]
+
+  dep_array_start_idx <- grep("^DEPENDENCIES=\\($", lines, perl = TRUE)
+  dep_array_lines <- lines[
+    seq.int(from = dep_array_start_idx + 1, to = length(lines) - 1, by = 1)
+  ]
+  version_lines <- lines[seq.int(1, dep_array_start_idx - 1, by = 1)]
+  version_info <- ..install_parse_version_lines(version_lines)
+
+  failed_to_parse <- c(
+    anyNA(orig_lines),
+    length(orig_lines) > 1000,
+    length(lines) == 0,
+    length(dep_array_start_idx) != 1,
+    dep_array_start_idx <= 1,
+    dep_array_start_idx >= length(lines) - 3,
+    lines[length(lines)] != ")",
+    length(dep_array_lines) == 0,
+    anyNA(version_info)
+  )
+
+  if (any(failed_to_parse)) {
+    stop(

Review Comment:
   You could use `rlang::abort` here but that is very heterogenous in the package so 🤷 (and you aimed for base R which is good!) 



##########
r/R/install-arrow.R:
##########
@@ -137,6 +137,262 @@ reload_arrow <- function() {
   }
 }
 
+# Substitute like the bash shell
+#
+# @param one_string A length-1 character vector
+# @param possible_values A dictionary-ish set of variables that could provide
+#   values to substitute in.
+# @return `one_string`, with values substituted like bash would.
+#
+# Only supports a small subset of bash substitution patterns. May have multiple
+# bash variables in `one_string`
+# Used as a helper to parse versions.txt
+..install_substitute_like_bash <- function(one_string, possible_values) {
+  stopifnot(
+    !is.null(names(possible_values)),
+    !any(names(possible_values) == ""),
+    anyDuplicated(names(possible_values)) == 0,
+    length(one_string) == 1,
+    !anyNA(possible_values)
+  )
+  # Find the name of the version we want, something like
+  # ARROW_RAPIDJSON_BUILD_VERSION
+  # The `(//./_|:1)` is a special case to handle some bash fanciness.
+  version_regex <- "\\$\\{(ARROW_[A-Z0-9_]+?_VERSION)(//./_|:1)?\\}"
+  # Extract the matched groups. If the pattern occurs multiple times, we need
+  # all (non-overlapping) matches. stringr::str_extract_all() or
+  # base::gregexec() would be useful here, but gregexec was only introduced in
+  # R 4.1.
+  matched_substrings <- regmatches(
+    one_string,
+    gregexpr(version_regex, one_string, perl = TRUE)
+  )[[1]] # Subset [[1]] because one_string has length 1
+  # `matched_substrings` is a character vector with length equal to the number
+  # of non-overlapping matches of `version_regex` in `one_string`. `match_list`
+  # is a list (same length as `matched_substrings`), where each list element is
+  # a length-3 character vector. The first element of the vector is the value
+  # from `matched_substrings` (e.g. "${ARROW_ZZZ_VERSION//./_}"). The following
+  #  two values are the captured groups specified in `version_regex` e.g.
+  # "ARROW_ZZZ_VERSION" and "//./_".

Review Comment:
   👍 



##########
r/R/install-arrow.R:
##########
@@ -137,6 +137,262 @@ reload_arrow <- function() {
   }
 }
 
+# Substitute like the bash shell
+#
+# @param one_string A length-1 character vector
+# @param possible_values A dictionary-ish set of variables that could provide
+#   values to substitute in.
+# @return `one_string`, with values substituted like bash would.
+#
+# Only supports a small subset of bash substitution patterns. May have multiple
+# bash variables in `one_string`
+# Used as a helper to parse versions.txt
+..install_substitute_like_bash <- function(one_string, possible_values) {

Review Comment:
   > I wrote some roxygen-like documentation, but didn't actually want to generate the `*.Rd` files, so just started the lines with `#` instead of `#'`.
   
   You can prevent the creation of an Rd file by using `@noRd` .



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org