You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ne...@apache.org on 2020/12/23 14:51:34 UTC

[arrow] branch master updated: ARROW-10967: [Rust] Add functions for test data to mod arrow::util::test_util

This is an automated email from the ASF dual-hosted git repository.

nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new be72a2b  ARROW-10967: [Rust] Add functions for test data to mod arrow::util::test_util
be72a2b is described below

commit be72a2bd8b4e1c2cebd4e34f8cd18006a566d55e
Author: mqy <me...@gmail.com>
AuthorDate: Wed Dec 23 16:50:17 2020 +0200

    ARROW-10967: [Rust] Add functions for test data to mod arrow::util::test_util
    
    If we can locate the test data dirs at runtime, both env vars `ARROW_TEST_DATA` and `PARQUET_TEST_DATA` become **optional**: there is no need to set them unless the testing data is not in the pre-defined location.
    
    This PR adds two similar public functions `arrow_test_data` and `parquet_test_data` to mod `arrow::util::test_util`, each behaves like this:
    
    - return the data dir from the user-defined env var if it is set and the corresponding dir exists.
    - if that env var is undefined or empty, return the default data dir, built by joining env `CARGO_MANIFEST_DIR` with the relative pre-defined data dir.
    - panic on error.
    
    Possible panic errors from `arrow_test_data()`:
    
    ```
    - failed to get arrow data dir: the data dir `non/existing` defined by env `ARROW_TEST_DATA` not found
    - failed to get arrow data dir: env `ARROW_TEST_DATA` is undefined or has empty value, and the pre-defined data dir `../../testing/data` not found
    ```
    
    Possible panic errors from `parquet_test_data()`:
    
    ```
    - failed to get parquet data dir: the data dir `non/existing` defined by env `PARQUET_TEST_DATA` not found
    - failed to get parquet data dir: env `PARQUET_TEST_DATA` is undefined or has empty value, and the pre-defined data dir `../../cpp/submodules/parquet-testing/data` not found
    ```
    
    Existing code can be updated like this:
    ```
    let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
    // change to
    let testdata = arrow::util::test_util::arrow_test_data();
    ```
    
    Closes #8967 from mqy/ARROW-10967_optional_env
    
    Authored-by: mqy <me...@gmail.com>
    Signed-off-by: Neville Dipale <ne...@gmail.com>
---
 rust/arrow/src/util/test_util.rs | 133 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 132 insertions(+), 1 deletion(-)

diff --git a/rust/arrow/src/util/test_util.rs b/rust/arrow/src/util/test_util.rs
index 6a70edd..8c085e2 100644
--- a/rust/arrow/src/util/test_util.rs
+++ b/rust/arrow/src/util/test_util.rs
@@ -18,7 +18,7 @@
 //! Utils to make testing easier
 
 use rand::{rngs::StdRng, Rng, SeedableRng};
-use std::{env, fs, io::Write};
+use std::{env, error::Error, fs, io::Write, path::PathBuf};
 
 /// Returns a vector of size `n`, filled with randomly generated bytes.
 pub fn random_bytes(n: usize) -> Vec<u8> {
@@ -60,3 +60,134 @@ pub fn get_temp_file(file_name: &str, content: &[u8]) -> fs::File {
     assert!(file.is_ok());
     file.unwrap()
 }
+
+/// Gets arrow test data dir, by optional env `ARROW_TEST_DATA` or the default
+/// `../../testing/data` (resolved against this crate's manifest dir).
+/// It panics if the selected path is not an existing directory.
+///
+/// Example:
+/// ```
+/// let testdata = arrow::util::test_util::arrow_test_data();
+/// let csvdata = format!("{}/csv/aggregate_test_100.csv", testdata);
+/// assert!(std::path::PathBuf::from(csvdata).exists());
+/// ```
+pub fn arrow_test_data() -> String {
+    match get_data_dir("ARROW_TEST_DATA", "../../testing/data") {
+        Ok(pb) => pb.display().to_string(),
+        Err(err) => panic!("failed to get arrow data dir: {}", err),
+    }
+}
+
+/// Gets parquet test data dir, by optional env `PARQUET_TEST_DATA` or the default
+/// `../../cpp/submodules/parquet-testing/data` (resolved against this crate's manifest dir).
+/// It panics if the selected path is not an existing directory.
+///
+/// Example:
+/// ```
+/// let testdata = arrow::util::test_util::parquet_test_data();
+/// let filename = format!("{}/binary.parquet", testdata);
+/// assert!(std::path::PathBuf::from(filename).exists());
+/// ```
+pub fn parquet_test_data() -> String {
+    match get_data_dir(
+        "PARQUET_TEST_DATA",
+        "../../cpp/submodules/parquet-testing/data",
+    ) {
+        Ok(pb) => pb.display().to_string(),
+        Err(err) => panic!("failed to get parquet data dir: {}", err),
+    }
+}
+
+/// Helper for `arrow_test_data` and `parquet_test_data`: returns the dir named by env
+/// `udf_env` when it is set to a non-blank value, otherwise `submodule_data` joined onto
+/// `CARGO_MANIFEST_DIR`. Returns `Err` if the selected path is not an existing directory.
+fn get_data_dir(udf_env: &str, submodule_data: &str) -> Result<PathBuf, Box<dyn Error>> {
+    // Try user defined env.
+    if let Ok(dir) = env::var(udf_env) {
+        let trimmed = dir.trim();
+        if !trimmed.is_empty() {
+            let pb = PathBuf::from(trimmed);
+            if pb.is_dir() {
+                return Ok(pb);
+            }
+            return Err(format!(
+                "the data dir `{}` defined by env `{}` not found",
+                pb.display(),
+                udf_env
+            )
+            .into());
+        }
+    }
+
+    // The env is undefined or its value is trimmed to empty, let's try default dir.
+    // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your package";
+    // cargo sets it when building and `env!` reads it at compile time, see:
+    // https://doc.rust-lang.org/cargo/reference/environment-variables.html
+    let dir = env!("CARGO_MANIFEST_DIR");
+
+    let pb = PathBuf::from(dir).join(submodule_data);
+    if pb.is_dir() {
+        Ok(pb)
+    } else {
+        Err(format!(
+            "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found",
+            udf_env,
+            pb.display(),
+        ).into())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::env;
+    // Steps below share one env var and must run in the written order.
+    #[test]
+    fn test_data_dir() {
+        let udf_env = "get_data_dir";
+        let cwd = env::current_dir().unwrap();
+        // Paths are built from cwd; presumably absolute, so `join` onto the manifest dir keeps them.
+        let existing_pb = cwd.join("..");
+        let existing = existing_pb.display().to_string();
+        let existing_str = existing.as_str();
+
+        let non_existing = cwd.join("non-existing-dir").display().to_string();
+        let non_existing_str = non_existing.as_str();
+        // Env names a missing dir: error, even though the fallback dir exists.
+        env::set_var(udf_env, non_existing_str);
+        let res = get_data_dir(udf_env, existing_str);
+        assert!(res.is_err());
+        // Env is empty: the fallback dir is used.
+        env::set_var(udf_env, "");
+        let res = get_data_dir(udf_env, existing_str);
+        assert!(res.is_ok());
+        assert_eq!(res.unwrap(), existing_pb);
+        // Env is whitespace only: trimmed to empty, so the fallback dir is used.
+        env::set_var(udf_env, " ");
+        let res = get_data_dir(udf_env, existing_str);
+        assert!(res.is_ok());
+        assert_eq!(res.unwrap(), existing_pb);
+        // Env names an existing dir: that dir is returned.
+        env::set_var(udf_env, existing_str);
+        let res = get_data_dir(udf_env, existing_str);
+        assert!(res.is_ok());
+        assert_eq!(res.unwrap(), existing_pb);
+        // Env is unset and the fallback dir is missing: error.
+        env::remove_var(udf_env);
+        let res = get_data_dir(udf_env, non_existing_str);
+        assert!(res.is_err());
+        // Env is still unset and the fallback dir exists: the fallback dir is returned.
+        let res = get_data_dir(udf_env, existing_str);
+        assert!(res.is_ok());
+        assert_eq!(res.unwrap(), existing_pb);
+    }
+    // Both calls panic unless the corresponding test data dir can be resolved.
+    #[test]
+    fn test_happy() {
+        let res = arrow_test_data();
+        assert!(PathBuf::from(res).is_dir());
+
+        let res = parquet_test_data();
+        assert!(PathBuf::from(res).is_dir());
+    }
+}