You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/04/01 03:02:00 UTC
[arrow] branch master updated: ARROW-5053: [Rust] [DataFusion] Use ARROW_TEST_DATA env var

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 20a3e6d  ARROW-5053: [Rust] [DataFusion] Use ARROW_TEST_DATA env var
20a3e6d is described below

commit 20a3e6d2be544b21987443c3bd04b3eb126e29f4
Author: Andy Grove <an...@gmail.com>
AuthorDate: Mon Apr 1 12:01:49 2019 +0900

    ARROW-5053: [Rust] [DataFusion] Use ARROW_TEST_DATA env var
    
    Use `ARROW_TEST_DATA` env var instead of hard-coded relative paths.
    
    Author: Andy Grove <an...@gmail.com>
    
    Closes #4068 from andygrove/ARROW-5053 and squashes the following commits:
    
    1b807f16 <Andy Grove> address PR feedback
    ecd5d1c1 <Andy Grove> fix typo
    cbf5b2a0 <Andy Grove> Change directory from testing to arrow-testing for consistency
    635eb09b <Andy Grove> set ARROW_TEST_DATA in verify-release-candidate
    55c07e06 <Andy Grove> set ARROW_TEST_DATA in verify-release-candidate
    2c51e4cb <Andy Grove> Set ARROW_TEST_DATA in CI
    fa0dca63 <Andy Grove> delete kcov files
    06ae7e77 <Andy Grove> delete kcov files
    3d636309 <Andy Grove> remove kvoc files
    e3c69201 <Andy Grove> Use ARROW_TEST_DATA env var to locate test data files
---
 ci/cpp-msvc-build-main.bat                     |  1 +
 ci/rust-build-main.bat                         |  1 +
 ci/travis_env_common.sh                        |  1 +
 dev/release/verify-release-candidate.bat       |  3 +++
 dev/release/verify-release-candidate.sh        |  6 +++---
 rust/README.md                                 | 10 ++++++++--
 rust/datafusion/benches/aggregate_query_sql.rs |  5 ++++-
 rust/datafusion/src/datasource/parquet.rs      |  3 ++-
 rust/datafusion/src/execution/aggregate.rs     | 13 ++++++++++---
 rust/datafusion/src/execution/projection.rs    |  5 ++++-
 rust/datafusion/src/table.rs                   |  4 +++-
 rust/datafusion/tests/sql.rs                   |  5 +++--
 12 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat
index da28d88..13dab24 100644
--- a/ci/cpp-msvc-build-main.bat
+++ b/ci/cpp-msvc-build-main.bat
@@ -35,6 +35,7 @@ if "%JOB%" == "Toolchain" (
 
 @rem Retrieve git submodules, configure env var for Parquet unit tests
 git submodule update --init || exit /B
+set ARROW_TEST_DATA=%CD%\testing\data
 set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data
 
 @rem Enable warnings-as-errors
diff --git a/ci/rust-build-main.bat b/ci/rust-build-main.bat
index b36a97a..5bf1c84 100644
--- a/ci/rust-build-main.bat
+++ b/ci/rust-build-main.bat
@@ -19,6 +19,7 @@
 
 @rem Retrieve git submodules, configure env var for Parquet unit tests
 git submodule update --init || exit /B
+set ARROW_TEST_DATA=%CD%\testing\data
 set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data
 pushd rust
 
diff --git a/ci/travis_env_common.sh b/ci/travis_env_common.sh
index a1cc125..d0cc415 100755
--- a/ci/travis_env_common.sh
+++ b/ci/travis_env_common.sh
@@ -73,6 +73,7 @@ if [ $TRAVIS_OS_NAME == "osx" ]; then
   export GOPATH=$TRAVIS_BUILD_DIR/gopath
 fi
 
+export ARROW_TEST_DATA=$TRAVIS_BUILD_DIR/testing/data
 export PARQUET_TEST_DATA=$TRAVIS_BUILD_DIR/cpp/submodules/parquet-testing/data
 
 # e.g. "trusty" or "xenial"
diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat
index 5072112..3f6d95c 100644
--- a/dev/release/verify-release-candidate.bat
+++ b/dev/release/verify-release-candidate.bat
@@ -84,6 +84,9 @@ cmake --build . --target INSTALL --config %CONFIGURATION%  || exit /B
 git clone https://github.com/apache/parquet-testing.git %_VERIFICATION_DIR%\parquet-testing
 set PARQUET_TEST_DATA=%_VERIFICATION_DIR%\parquet-testing\data
 
+git clone https://github.com/apache/arrow-testing.git %_VERIFICATION_DIR%\arrow-testing
+set ARROW_TEST_DATA=%_VERIFICATION_DIR%\arrow-testing\data
+
 @rem Needed so python-test.exe works
 set PYTHONPATH=%CONDA_PREFIX%\Lib;%CONDA_PREFIX%\Lib\site-packages;%CONDA_PREFIX%\python35.zip;%CONDA_PREFIX%\DLLs;%CONDA_PREFIX%;%PYTHONPATH%
 
diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh
index c8c631c..3b558d0 100755
--- a/dev/release/verify-release-candidate.sh
+++ b/dev/release/verify-release-candidate.sh
@@ -403,6 +403,9 @@ if [ "$ARTIFACT" == "source" ]; then
   TEST_JAVA=$((${TEST_JAVA} + ${TEST_INTEGRATION}))
   TEST_JS=$((${TEST_JS} + ${TEST_INTEGRATION}))
 
+  git clone https://github.com/apache/arrow-testing.git
+  export ARROW_TEST_DATA=$PWD/arrow-testing/data
+
   git clone https://github.com/apache/parquet-testing.git
   export PARQUET_TEST_DATA=$PWD/parquet-testing/data
 
@@ -410,9 +413,6 @@ if [ "$ARTIFACT" == "source" ]; then
   tar xvzf ${DIST_NAME}.tar.gz
   cd ${DIST_NAME}
 
-  rm -r testing
-  git clone https://github.com/apache/arrow-testing.git testing
-
   if [ ${TEST_JAVA} -gt 0 ]; then
     test_package_java
   fi
diff --git a/rust/README.md b/rust/README.md
index a968052..377cf6c 100644
--- a/rust/README.md
+++ b/rust/README.md
@@ -46,8 +46,14 @@ This populates data in two git submodules:
 - `cpp/submodules/parquet-testing/data` (sourced from https://github.com/apache/parquet-testing.git)
 - `testing` (sourced from https://github.com/apache/arrow-testing)
 
-Create a new environment variable called `PARQUET_TEST_DATA` to point
-to `cpp/submodules/parquet-testing/data` and then `cargo test` as usual.
+Create two new environment variables to point to these directories as follows:
+
+```bash
+export PARQUET_TEST_DATA=/path/to/arrow/cpp/submodules/parquet-testing/data
+export ARROW_TEST_DATA=/path/to/arrow/testing/data
+```
+
+It is now possible to run `cargo test` as usual.
 
 ## Code Formatting
 
diff --git a/rust/datafusion/benches/aggregate_query_sql.rs b/rust/datafusion/benches/aggregate_query_sql.rs
index bde8787..d0c1b0f 100644
--- a/rust/datafusion/benches/aggregate_query_sql.rs
+++ b/rust/datafusion/benches/aggregate_query_sql.rs
@@ -20,6 +20,7 @@ extern crate criterion;
 use criterion::Criterion;
 
 use std::cell::RefCell;
+use std::env;
 use std::rc::Rc;
 use std::sync::Arc;
 
@@ -60,9 +61,11 @@ fn create_context() -> Rc<RefCell<ExecutionContext>> {
         Field::new("c13", DataType::Utf8, false),
     ]));
 
+    let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
+
     // create CSV data source
     let csv = CsvFile::new(
-        "../../testing/data/csv/aggregate_test_100.csv",
+        &format!("{}/csv/aggregate_test_100.csv", testdata),
         &schema,
         true,
     );
diff --git a/rust/datafusion/src/datasource/parquet.rs b/rust/datafusion/src/datasource/parquet.rs
index 79e84ce..b645bce 100644
--- a/rust/datafusion/src/datasource/parquet.rs
+++ b/rust/datafusion/src/datasource/parquet.rs
@@ -678,7 +678,8 @@ mod tests {
     }
 
     fn load_table(name: &str) -> Box<TableProvider> {
-        let testdata = env::var("PARQUET_TEST_DATA").unwrap();
+        let testdata =
+            env::var("PARQUET_TEST_DATA").expect("PARQUET_TEST_DATA not defined");
         let filename = format!("{}/{}", testdata, name);
         let table = ParquetTable::try_new(&filename).unwrap();
         Box::new(table)
diff --git a/rust/datafusion/src/execution/aggregate.rs b/rust/datafusion/src/execution/aggregate.rs
index 4417d13..84fe925 100644
--- a/rust/datafusion/src/execution/aggregate.rs
+++ b/rust/datafusion/src/execution/aggregate.rs
@@ -1030,12 +1030,15 @@ mod tests {
     use crate::execution::relation::DataSourceRelation;
     use crate::logicalplan::Expr;
     use arrow::datatypes::{DataType, Field, Schema};
+    use std::env;
     use std::sync::Mutex;
 
     #[test]
     fn min_f64_group_by_string() {
         let schema = aggr_test_schema();
-        let relation = load_csv("../../testing/data/csv/aggregate_test_100.csv", &schema);
+        let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
+        let relation =
+            load_csv(&format!("{}/csv/aggregate_test_100.csv", testdata), &schema);
         let context = ExecutionContext::new();
 
         let aggr_expr = vec![expression::compile_aggregate_expr(
@@ -1070,7 +1073,9 @@ mod tests {
     #[test]
     fn max_f64_group_by_string() {
         let schema = aggr_test_schema();
-        let relation = load_csv("../../testing/data/csv/aggregate_test_100.csv", &schema);
+        let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
+        let relation =
+            load_csv(&format!("{}/csv/aggregate_test_100.csv", testdata), &schema);
         let context = ExecutionContext::new();
 
         let aggr_expr = vec![expression::compile_aggregate_expr(
@@ -1105,7 +1110,9 @@ mod tests {
     #[test]
     fn test_min_max_sum_f64_group_by_uint32() {
         let schema = aggr_test_schema();
-        let relation = load_csv("../../testing/data/csv/aggregate_test_100.csv", &schema);
+        let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
+        let relation =
+            load_csv(&format!("{}/csv/aggregate_test_100.csv", testdata), &schema);
 
         let context = ExecutionContext::new();
 
diff --git a/rust/datafusion/src/execution/projection.rs b/rust/datafusion/src/execution/projection.rs
index cf91409..6f458cd 100644
--- a/rust/datafusion/src/execution/projection.rs
+++ b/rust/datafusion/src/execution/projection.rs
@@ -93,6 +93,7 @@ mod tests {
     use crate::execution::relation::DataSourceRelation;
     use crate::logicalplan::Expr;
     use arrow::datatypes::{DataType, Field, Schema};
+    use std::env;
     use std::sync::Mutex;
 
     #[test]
@@ -113,8 +114,10 @@ mod tests {
             Field::new("c12", DataType::Utf8, false),
         ]));
 
+        let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
+
         let ds = CsvBatchIterator::new(
-            "../../testing/data/csv/aggregate_test_100.csv",
+            &format!("{}/csv/aggregate_test_100.csv", testdata),
             schema.clone(),
             true,
             &None,
diff --git a/rust/datafusion/src/table.rs b/rust/datafusion/src/table.rs
index b1421c6..a7720f2 100644
--- a/rust/datafusion/src/table.rs
+++ b/rust/datafusion/src/table.rs
@@ -39,6 +39,7 @@ mod tests {
     use super::*;
     use crate::execution::context::ExecutionContext;
     use arrow::datatypes::*;
+    use std::env;
 
     #[test]
     fn demonstrate_api_usage() {
@@ -60,9 +61,10 @@ mod tests {
 
     fn register_aggregate_csv(ctx: &mut ExecutionContext) {
         let schema = aggr_test_schema();
+        let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
         ctx.register_csv(
             "aggregate_test_100",
-            "../../testing/data/csv/aggregate_test_100.csv",
+            &format!("{}/csv/aggregate_test_100.csv", testdata),
             &schema,
             true,
         );
diff --git a/rust/datafusion/tests/sql.rs b/rust/datafusion/tests/sql.rs
index 6a61eb5..7b04609 100644
--- a/rust/datafusion/tests/sql.rs
+++ b/rust/datafusion/tests/sql.rs
@@ -171,11 +171,12 @@ fn aggr_test_schema() -> Arc<Schema> {
 }
 
 fn register_aggregate_csv(ctx: &mut ExecutionContext) {
+    let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
     let schema = aggr_test_schema();
     register_csv(
         ctx,
         "aggregate_test_100",
-        "../../testing/data/csv/aggregate_test_100.csv",
+        &format!("{}/csv/aggregate_test_100.csv", testdata),
         &schema,
     );
 }
@@ -190,7 +191,7 @@ fn register_csv(
 }
 
 fn load_parquet_table(name: &str) -> Rc<TableProvider> {
-    let testdata = env::var("PARQUET_TEST_DATA").unwrap();
+    let testdata = env::var("PARQUET_TEST_DATA").expect("PARQUET_TEST_DATA not defined");
     let filename = format!("{}/{}", testdata, name);
     let table = ParquetTable::try_new(&filename).unwrap();
     Rc::new(table)