You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/12/23 12:36:21 UTC

[GitHub] [arrow] alamb commented on a change in pull request #8967: ARROW-10967: [Rust] Add functions for test data to mod arrow::util::test_util

alamb commented on a change in pull request #8967:
URL: https://github.com/apache/arrow/pull/8967#discussion_r547916595



##########
File path: rust/arrow/src/util/test_util.rs
##########
@@ -60,3 +60,134 @@ pub fn get_temp_file(file_name: &str, content: &[u8]) -> fs::File {
     assert!(file.is_ok());
     file.unwrap()
 }
+
+/// Gets arrow test data dir, by optional env `ARROW_TEST_DATA` or the default
+/// `../../testing/data`.
+/// It panics when failed to get dir.
+///
+/// Example:
+/// ```
+/// let testdata = arrow::util::test_util::arrow_test_data();
+/// let csvdata = format!("{}/csv/aggregate_test_100.csv", testdata);
+/// assert!(std::path::PathBuf::from(csvdata).exists());
+/// ```
+pub fn arrow_test_data() -> String {
+    match get_data_dir("ARROW_TEST_DATA", "../../testing/data") {
+        Ok(pb) => pb.display().to_string(),
+        Err(err) => panic!(format!("failed to get arrow data dir: {}", err)),
+    }
+}
+
+/// Gets parquet test data dir, by optional env `PARQUET_TEST_DATA` or the default
+/// `../../cpp/submodules/parquet-testing/data`.
+/// It panics when failed to get dir.
+///
+/// Example:
+/// ```
+/// let testdata = arrow::util::test_util::parquet_test_data();
+/// let filename = format!("{}/binary.parquet", testdata);
+/// assert!(std::path::PathBuf::from(filename).exists());
+/// ```
+pub fn parquet_test_data() -> String {
+    match get_data_dir(
+        "PARQUET_TEST_DATA",
+        "../../cpp/submodules/parquet-testing/data",
+    ) {
+        Ok(pb) => pb.display().to_string(),
+        Err(err) => panic!(format!("failed to get parquet data dir: {}", err)),
+    }
+}
+
+/// get_data_dir is the helper function for `arrow_test_data` and `arrow_test_data`.
+fn get_data_dir(udf_env: &str, submodule_data: &str) -> Result<PathBuf, Box<dyn Error>> {
+    // Try user defined env.
+    if let Ok(dir) = env::var(udf_env) {
+        let trimmed = dir.trim().to_string();
+        if !trimmed.is_empty() {
+            let pb = PathBuf::from(trimmed);
+            if pb.is_dir() {
+                return Ok(pb);
+            } else {
+                return Err(format!(
+                    "the data dir `{}` defined by env {} not found",
+                    pb.display().to_string(),
+                    udf_env
+                )
+                .into());
+            }
+        }
+    }
+
+    // The env is undefined or it's value is trimmed to empty, let's try default dir.
+
+    // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your package",
+    // set by `cargo run` or `cargo test`, see:
+    // https://doc.rust-lang.org/cargo/reference/environment-variables.html
+    let dir = env!("CARGO_MANIFEST_DIR");
+
+    let pb = PathBuf::from(dir).join(submodule_data);
+    if pb.is_dir() {
+        Ok(pb)
+    } else {
+        Err(format!(
+            "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found", 
+            udf_env,
+            pb.display().to_string(),
+        ).into())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::env;
+
+    #[test]
+    fn test_data_dir() {
+        let udf_env = "get_data_dir";
+        let cwd = env::current_dir().unwrap();
+
+        let existing_pb = cwd.join("..");
+        let existing = existing_pb.display().to_string();
+        let existing_str = existing.as_str();
+
+        let non_existing = cwd.join("non-existing-dir").display().to_string();
+        let non_existing_str = non_existing.as_str();
+
+        env::set_var(udf_env, non_existing_str);
+        let res = get_data_dir(udf_env, existing_str);
+        debug_assert!(res.is_err());

Review comment:
       I would personally recommend using `assert!` here and below rather than debug assert -- but it probably makes no practical difference since we would run these tests only on debug builds.
   
   ```suggestion
           debug!(res.is_err());
   ```

##########
File path: rust/arrow/src/util/test_util.rs
##########
@@ -60,3 +60,134 @@ pub fn get_temp_file(file_name: &str, content: &[u8]) -> fs::File {
     assert!(file.is_ok());
     file.unwrap()
 }
+
+/// Gets arrow test data dir, by optional env `ARROW_TEST_DATA` or the default
+/// `../../testing/data`.
+/// It panics when failed to get dir.
+///
+/// Example:
+/// ```
+/// let testdata = arrow::util::test_util::arrow_test_data();
+/// let csvdata = format!("{}/csv/aggregate_test_100.csv", testdata);
+/// assert!(std::path::PathBuf::from(csvdata).exists());
+/// ```
+pub fn arrow_test_data() -> String {
+    match get_data_dir("ARROW_TEST_DATA", "../../testing/data") {
+        Ok(pb) => pb.display().to_string(),
+        Err(err) => panic!(format!("failed to get arrow data dir: {}", err)),
+    }
+}
+
+/// Gets parquet test data dir, by optional env `PARQUET_TEST_DATA` or the default
+/// `../../cpp/submodules/parquet-testing/data`.
+/// It panics when failed to get dir.
+///
+/// Example:
+/// ```
+/// let testdata = arrow::util::test_util::parquet_test_data();
+/// let filename = format!("{}/binary.parquet", testdata);
+/// assert!(std::path::PathBuf::from(filename).exists());
+/// ```
+pub fn parquet_test_data() -> String {
+    match get_data_dir(
+        "PARQUET_TEST_DATA",
+        "../../cpp/submodules/parquet-testing/data",
+    ) {
+        Ok(pb) => pb.display().to_string(),
+        Err(err) => panic!(format!("failed to get parquet data dir: {}", err)),
+    }
+}
+
+/// get_data_dir is the helper function for `arrow_test_data` and `arrow_test_data`.
+fn get_data_dir(udf_env: &str, submodule_data: &str) -> Result<PathBuf, Box<dyn Error>> {

Review comment:
       ```suggestion
   /// Returns a directory path for finding test data. 
   ///
   /// udf_env: name of an environment variable
   ///
   /// submodule_dir: fallback path (relative to CARGO_MANIFEST_DIR)
   /// 
   ///  Returns either:
   /// The path referred to in `udf_env` if that variable is set and refers to a directory
   /// The submodule_data directory relative to CARGO_MANIFEST_PATH (i.e.`$CARGO_MANIFEST_DIR/$submodule_data`)
   fn get_data_dir(udf_env: &str, submodule_data: &str) -> Result<PathBuf, Box<dyn Error>> {
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org