You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ne...@apache.org on 2020/12/23 14:51:34 UTC
[arrow] branch master updated: ARROW-10967: [Rust] Add functions
for test data to mod arrow::util::test_util
This is an automated email from the ASF dual-hosted git repository.
nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new be72a2b ARROW-10967: [Rust] Add functions for test data to mod arrow::util::test_util
be72a2b is described below
commit be72a2bd8b4e1c2cebd4e34f8cd18006a566d55e
Author: mqy <me...@gmail.com>
AuthorDate: Wed Dec 23 16:50:17 2020 +0200
ARROW-10967: [Rust] Add functions for test data to mod arrow::util::test_util
If the test data dirs can be resolved at runtime, both env vars `ARROW_TEST_DATA` and `PARQUET_TEST_DATA` become **optional**: there is no need to set them unless the test data is not in the pre-defined location.
This PR adds two similar public functions `arrow_test_data` and `parquet_test_data` to mod `arrow::util::test_util`, each behaves like this:
- return data dir from user defined env if defined and corresponding dir exists.
- return default data dir by joining env `CARGO_MANIFEST_DIR` and the relative pre-defined data dir.
- panic on error.
Possible panic errors from `arrow_test_data()`:
```
- failed to get arrow data dir: the data dir `non/existing` defined by env `ARROW_TEST_DATA` not found
- failed to get arrow data dir: env `ARROW_TEST_DATA` is undefined or has empty value, and the pre-defined data dir `../../testing/data` not found
```
Possible panic errors from `parquet_test_data()`:
```
- failed to get parquet data dir: the data dir `non/existing` defined by env `PARQUET_TEST_DATA` not found
- failed to get parquet data dir: env `PARQUET_TEST_DATA` is undefined or has empty value, and the pre-defined data dir `../../cpp/submodules/parquet-testing/data` not found
```
Existing code can be updated like this:
```
let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
// change to
let testdata = arrow::util::test_util::arrow_test_data();
```
Closes #8967 from mqy/ARROW-10967_optional_env
Authored-by: mqy <me...@gmail.com>
Signed-off-by: Neville Dipale <ne...@gmail.com>
---
rust/arrow/src/util/test_util.rs | 133 ++++++++++++++++++++++++++++++++++++++-
1 file changed, 132 insertions(+), 1 deletion(-)
diff --git a/rust/arrow/src/util/test_util.rs b/rust/arrow/src/util/test_util.rs
index 6a70edd..8c085e2 100644
--- a/rust/arrow/src/util/test_util.rs
+++ b/rust/arrow/src/util/test_util.rs
@@ -18,7 +18,7 @@
//! Utils to make testing easier
use rand::{rngs::StdRng, Rng, SeedableRng};
-use std::{env, fs, io::Write};
+use std::{env, error::Error, fs, io::Write, path::PathBuf};
/// Returns a vector of size `n`, filled with randomly generated bytes.
pub fn random_bytes(n: usize) -> Vec<u8> {
@@ -60,3 +60,134 @@ pub fn get_temp_file(file_name: &str, content: &[u8]) -> fs::File {
assert!(file.is_ok());
file.unwrap()
}
+
+/// Returns the path of the arrow test data dir as a `String`.
+///
+/// If env `ARROW_TEST_DATA` is set to a non-blank value, that dir is used;
+/// otherwise the default `../../testing/data`, joined onto this crate's
+/// `CARGO_MANIFEST_DIR`, is used.
+///
+/// # Panics
+///
+/// Panics if the selected dir is not an existing directory.
+///
+/// # Examples
+///
+/// ```
+/// let testdata = arrow::util::test_util::arrow_test_data();
+/// let csvdata = format!("{}/csv/aggregate_test_100.csv", testdata);
+/// assert!(std::path::PathBuf::from(csvdata).exists());
+/// ```
+pub fn arrow_test_data() -> String {
+    match get_data_dir("ARROW_TEST_DATA", "../../testing/data") {
+        Ok(pb) => pb.display().to_string(),
+        // Hand the format string to `panic!` directly: wrapping it in `format!`
+        // trips the `non_fmt_panics` lint and is rejected by the 2021 edition.
+        Err(err) => panic!("failed to get arrow data dir: {}", err),
+    }
+}
+
+/// Returns the path of the parquet test data dir as a `String`.
+///
+/// If env `PARQUET_TEST_DATA` is set to a non-blank value, that dir is used;
+/// otherwise the default `../../cpp/submodules/parquet-testing/data`, joined
+/// onto this crate's `CARGO_MANIFEST_DIR`, is used.
+///
+/// # Panics
+///
+/// Panics if the selected dir is not an existing directory.
+///
+/// # Examples
+///
+/// ```
+/// let testdata = arrow::util::test_util::parquet_test_data();
+/// let filename = format!("{}/binary.parquet", testdata);
+/// assert!(std::path::PathBuf::from(filename).exists());
+/// ```
+pub fn parquet_test_data() -> String {
+    match get_data_dir(
+        "PARQUET_TEST_DATA",
+        "../../cpp/submodules/parquet-testing/data",
+    ) {
+        Ok(pb) => pb.display().to_string(),
+        // Hand the format string to `panic!` directly: wrapping it in `format!`
+        // trips the `non_fmt_panics` lint and is rejected by the 2021 edition.
+        Err(err) => panic!("failed to get parquet data dir: {}", err),
+    }
+}
+
+/// Helper for `arrow_test_data` and `parquet_test_data` that resolves a test data dir.
+///
+/// `udf_env` is the name of the user-defined env var that may override the
+/// location; `submodule_data` is the default dir, joined onto this crate's
+/// compile-time `CARGO_MANIFEST_DIR`.
+///
+/// # Errors
+///
+/// Returns an error if `udf_env` holds a non-blank value that is not an existing
+/// directory (the default is not tried in that case), or if `udf_env` is unset
+/// or blank and the default dir is not an existing directory.
+fn get_data_dir(udf_env: &str, submodule_data: &str) -> Result<PathBuf, Box<dyn Error>> {
+    // Try user defined env.
+    if let Ok(dir) = env::var(udf_env) {
+        let dir = dir.trim();
+        if !dir.is_empty() {
+            let pb = PathBuf::from(dir);
+            return if pb.is_dir() {
+                Ok(pb)
+            } else {
+                Err(format!(
+                    "the data dir `{}` defined by env `{}` not found",
+                    pb.display(),
+                    udf_env
+                )
+                .into())
+            };
+        }
+    }
+
+    // The env is undefined or its value is trimmed to empty, let's try default dir.
+
+    // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your package".
+    // `env!` reads it while the crate is being compiled, so no runtime lookup is needed, see:
+    // https://doc.rust-lang.org/cargo/reference/environment-variables.html
+    let dir = env!("CARGO_MANIFEST_DIR");
+
+    let pb = PathBuf::from(dir).join(submodule_data);
+    if pb.is_dir() {
+        Ok(pb)
+    } else {
+        Err(format!(
+            "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found",
+            udf_env,
+            pb.display(),
+        ).into())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::env;
+
+    /// Walks `get_data_dir` through the env-override and default-dir paths,
+    /// using a dedicated env var so the real data dir settings are untouched.
+    #[test]
+    fn test_data_dir() {
+        let env_name = "get_data_dir";
+        let base = env::current_dir().unwrap();
+
+        let present_path = base.join("..");
+        let present = present_path.display().to_string();
+        let missing = base.join("non-existing-dir").display().to_string();
+
+        // Shared check: resolution succeeds and yields the existing dir.
+        let expect_found = |default_dir: &str| {
+            let outcome = get_data_dir(env_name, default_dir);
+            assert!(outcome.is_ok());
+            assert_eq!(outcome.unwrap(), present_path);
+        };
+
+        // A non-blank env value naming a missing dir is an error, even though
+        // the default dir exists.
+        env::set_var(env_name, &missing);
+        assert!(get_data_dir(env_name, &present).is_err());
+
+        // Empty and blank values fall through to the default dir; an existing
+        // dir given through the env is returned as is.
+        for value in &["", " ", present.as_str()] {
+            env::set_var(env_name, value);
+            expect_found(&present);
+        }
+
+        // With the env removed, only the default dir decides the outcome.
+        env::remove_var(env_name);
+        assert!(get_data_dir(env_name, &missing).is_err());
+        expect_found(&present);
+    }
+
+    /// Both public helpers resolve to existing directories in a normal checkout.
+    #[test]
+    fn test_happy() {
+        assert!(PathBuf::from(arrow_test_data()).is_dir());
+        assert!(PathBuf::from(parquet_test_data()).is_dir());
+    }
+}