You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/04/01 03:02:00 UTC
[arrow] branch master updated: ARROW-5053: [Rust] [DataFusion] Use ARROW_TEST_DATA env var
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 20a3e6d ARROW-5053: [Rust] [DataFusion] Use ARROW_TEST_DATA env var
20a3e6d is described below
commit 20a3e6d2be544b21987443c3bd04b3eb126e29f4
Author: Andy Grove <an...@gmail.com>
AuthorDate: Mon Apr 1 12:01:49 2019 +0900
ARROW-5053: [Rust] [DataFusion] Use ARROW_TEST_DATA env var
Use `ARROW_TEST_DATA` env var instead of hard-coded relative paths.
Author: Andy Grove <an...@gmail.com>
Closes #4068 from andygrove/ARROW-5053 and squashes the following commits:
1b807f16 <Andy Grove> address PR feedback
ecd5d1c1 <Andy Grove> fix typo
cbf5b2a0 <Andy Grove> Change directory from testing to arrow-testing for consistency
635eb09b <Andy Grove> set ARROW_TEST_DATA in verify-release-candidate
55c07e06 <Andy Grove> set ARROW_TEST_DATA in verify-release-candidate
2c51e4cb <Andy Grove> Set ARROW_TEST_DATA in CI
fa0dca63 <Andy Grove> delete kcov files
06ae7e77 <Andy Grove> delete kcov files
3d636309 <Andy Grove> remove kcov files
e3c69201 <Andy Grove> Use ARROW_TEST_DATA env var to locate test data files
---
ci/cpp-msvc-build-main.bat | 1 +
ci/rust-build-main.bat | 1 +
ci/travis_env_common.sh | 1 +
dev/release/verify-release-candidate.bat | 3 +++
dev/release/verify-release-candidate.sh | 6 +++---
rust/README.md | 10 ++++++++--
rust/datafusion/benches/aggregate_query_sql.rs | 5 ++++-
rust/datafusion/src/datasource/parquet.rs | 3 ++-
rust/datafusion/src/execution/aggregate.rs | 13 ++++++++++---
rust/datafusion/src/execution/projection.rs | 5 ++++-
rust/datafusion/src/table.rs | 4 +++-
rust/datafusion/tests/sql.rs | 5 +++--
12 files changed, 43 insertions(+), 14 deletions(-)
diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat
index da28d88..13dab24 100644
--- a/ci/cpp-msvc-build-main.bat
+++ b/ci/cpp-msvc-build-main.bat
@@ -35,6 +35,7 @@ if "%JOB%" == "Toolchain" (
@rem Retrieve git submodules, configure env var for Parquet unit tests
git submodule update --init || exit /B
+set ARROW_TEST_DATA=%CD%\testing\data
set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data
@rem Enable warnings-as-errors
diff --git a/ci/rust-build-main.bat b/ci/rust-build-main.bat
index b36a97a..5bf1c84 100644
--- a/ci/rust-build-main.bat
+++ b/ci/rust-build-main.bat
@@ -19,6 +19,7 @@
@rem Retrieve git submodules, configure env var for Parquet unit tests
git submodule update --init || exit /B
+set ARROW_TEST_DATA=%CD%\testing\data
set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data
pushd rust
diff --git a/ci/travis_env_common.sh b/ci/travis_env_common.sh
index a1cc125..d0cc415 100755
--- a/ci/travis_env_common.sh
+++ b/ci/travis_env_common.sh
@@ -73,6 +73,7 @@ if [ $TRAVIS_OS_NAME == "osx" ]; then
export GOPATH=$TRAVIS_BUILD_DIR/gopath
fi
+export ARROW_TEST_DATA=$TRAVIS_BUILD_DIR/testing/data
export PARQUET_TEST_DATA=$TRAVIS_BUILD_DIR/cpp/submodules/parquet-testing/data
# e.g. "trusty" or "xenial"
diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat
index 5072112..3f6d95c 100644
--- a/dev/release/verify-release-candidate.bat
+++ b/dev/release/verify-release-candidate.bat
@@ -84,6 +84,9 @@ cmake --build . --target INSTALL --config %CONFIGURATION% || exit /B
git clone https://github.com/apache/parquet-testing.git %_VERIFICATION_DIR%\parquet-testing
set PARQUET_TEST_DATA=%_VERIFICATION_DIR%\parquet-testing\data
+git clone https://github.com/apache/arrow-testing.git %_VERIFICATION_DIR%\arrow-testing
+set ARROW_TEST_DATA=%_VERIFICATION_DIR%\arrow-testing\data
+
@rem Needed so python-test.exe works
set PYTHONPATH=%CONDA_PREFIX%\Lib;%CONDA_PREFIX%\Lib\site-packages;%CONDA_PREFIX%\python35.zip;%CONDA_PREFIX%\DLLs;%CONDA_PREFIX%;%PYTHONPATH%
diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh
index c8c631c..3b558d0 100755
--- a/dev/release/verify-release-candidate.sh
+++ b/dev/release/verify-release-candidate.sh
@@ -403,6 +403,9 @@ if [ "$ARTIFACT" == "source" ]; then
TEST_JAVA=$((${TEST_JAVA} + ${TEST_INTEGRATION}))
TEST_JS=$((${TEST_JS} + ${TEST_INTEGRATION}))
+ git clone https://github.com/apache/arrow-testing.git
+ export ARROW_TEST_DATA=$PWD/arrow-testing/data
+
git clone https://github.com/apache/parquet-testing.git
export PARQUET_TEST_DATA=$PWD/parquet-testing/data
@@ -410,9 +413,6 @@ if [ "$ARTIFACT" == "source" ]; then
tar xvzf ${DIST_NAME}.tar.gz
cd ${DIST_NAME}
- rm -r testing
- git clone https://github.com/apache/arrow-testing.git testing
-
if [ ${TEST_JAVA} -gt 0 ]; then
test_package_java
fi
diff --git a/rust/README.md b/rust/README.md
index a968052..377cf6c 100644
--- a/rust/README.md
+++ b/rust/README.md
@@ -46,8 +46,14 @@ This populates data in two git submodules:
- `cpp/submodules/parquet-testing/data` (sourced from https://github.com/apache/parquet-testing.git)
- `testing` (sourced from https://github.com/apache/arrow-testing)
-Create a new environment variable called `PARQUET_TEST_DATA` to point
-to `cpp/submodules/parquet-testing/data` and then `cargo test` as usual.
+Create two new environment variables to point to these directories as follows:
+
+```bash
+export PARQUET_TEST_DATA=/path/to/arrow/cpp/submodules/parquet-testing/data
+export ARROW_TEST_DATA=/path/to/arrow/testing/data/
+```
+
+It is now possible to run `cargo test` as usual.
## Code Formatting
diff --git a/rust/datafusion/benches/aggregate_query_sql.rs b/rust/datafusion/benches/aggregate_query_sql.rs
index bde8787..d0c1b0f 100644
--- a/rust/datafusion/benches/aggregate_query_sql.rs
+++ b/rust/datafusion/benches/aggregate_query_sql.rs
@@ -20,6 +20,7 @@ extern crate criterion;
use criterion::Criterion;
use std::cell::RefCell;
+use std::env;
use std::rc::Rc;
use std::sync::Arc;
@@ -60,9 +61,11 @@ fn create_context() -> Rc<RefCell<ExecutionContext>> {
Field::new("c13", DataType::Utf8, false),
]));
+ let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
+
// create CSV data source
let csv = CsvFile::new(
- "../../testing/data/csv/aggregate_test_100.csv",
+ &format!("{}/csv/aggregate_test_100.csv", testdata),
&schema,
true,
);
diff --git a/rust/datafusion/src/datasource/parquet.rs b/rust/datafusion/src/datasource/parquet.rs
index 79e84ce..b645bce 100644
--- a/rust/datafusion/src/datasource/parquet.rs
+++ b/rust/datafusion/src/datasource/parquet.rs
@@ -678,7 +678,8 @@ mod tests {
}
fn load_table(name: &str) -> Box<TableProvider> {
- let testdata = env::var("PARQUET_TEST_DATA").unwrap();
+ let testdata =
+ env::var("PARQUET_TEST_DATA").expect("PARQUET_TEST_DATA not defined");
let filename = format!("{}/{}", testdata, name);
let table = ParquetTable::try_new(&filename).unwrap();
Box::new(table)
diff --git a/rust/datafusion/src/execution/aggregate.rs b/rust/datafusion/src/execution/aggregate.rs
index 4417d13..84fe925 100644
--- a/rust/datafusion/src/execution/aggregate.rs
+++ b/rust/datafusion/src/execution/aggregate.rs
@@ -1030,12 +1030,15 @@ mod tests {
use crate::execution::relation::DataSourceRelation;
use crate::logicalplan::Expr;
use arrow::datatypes::{DataType, Field, Schema};
+ use std::env;
use std::sync::Mutex;
#[test]
fn min_f64_group_by_string() {
let schema = aggr_test_schema();
- let relation = load_csv("../../testing/data/csv/aggregate_test_100.csv", &schema);
+ let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
+ let relation =
+ load_csv(&format!("{}/csv/aggregate_test_100.csv", testdata), &schema);
let context = ExecutionContext::new();
let aggr_expr = vec![expression::compile_aggregate_expr(
@@ -1070,7 +1073,9 @@ mod tests {
#[test]
fn max_f64_group_by_string() {
let schema = aggr_test_schema();
- let relation = load_csv("../../testing/data/csv/aggregate_test_100.csv", &schema);
+ let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
+ let relation =
+ load_csv(&format!("{}/csv/aggregate_test_100.csv", testdata), &schema);
let context = ExecutionContext::new();
let aggr_expr = vec![expression::compile_aggregate_expr(
@@ -1105,7 +1110,9 @@ mod tests {
#[test]
fn test_min_max_sum_f64_group_by_uint32() {
let schema = aggr_test_schema();
- let relation = load_csv("../../testing/data/csv/aggregate_test_100.csv", &schema);
+ let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
+ let relation =
+ load_csv(&format!("{}/csv/aggregate_test_100.csv", testdata), &schema);
let context = ExecutionContext::new();
diff --git a/rust/datafusion/src/execution/projection.rs b/rust/datafusion/src/execution/projection.rs
index cf91409..6f458cd 100644
--- a/rust/datafusion/src/execution/projection.rs
+++ b/rust/datafusion/src/execution/projection.rs
@@ -93,6 +93,7 @@ mod tests {
use crate::execution::relation::DataSourceRelation;
use crate::logicalplan::Expr;
use arrow::datatypes::{DataType, Field, Schema};
+ use std::env;
use std::sync::Mutex;
#[test]
@@ -113,8 +114,10 @@ mod tests {
Field::new("c12", DataType::Utf8, false),
]));
+ let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
+
let ds = CsvBatchIterator::new(
- "../../testing/data/csv/aggregate_test_100.csv",
+ &format!("{}/csv/aggregate_test_100.csv", testdata),
schema.clone(),
true,
&None,
diff --git a/rust/datafusion/src/table.rs b/rust/datafusion/src/table.rs
index b1421c6..a7720f2 100644
--- a/rust/datafusion/src/table.rs
+++ b/rust/datafusion/src/table.rs
@@ -39,6 +39,7 @@ mod tests {
use super::*;
use crate::execution::context::ExecutionContext;
use arrow::datatypes::*;
+ use std::env;
#[test]
fn demonstrate_api_usage() {
@@ -60,9 +61,10 @@ mod tests {
fn register_aggregate_csv(ctx: &mut ExecutionContext) {
let schema = aggr_test_schema();
+ let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
ctx.register_csv(
"aggregate_test_100",
- "../../testing/data/csv/aggregate_test_100.csv",
+ &format!("{}/csv/aggregate_test_100.csv", testdata),
&schema,
true,
);
diff --git a/rust/datafusion/tests/sql.rs b/rust/datafusion/tests/sql.rs
index 6a61eb5..7b04609 100644
--- a/rust/datafusion/tests/sql.rs
+++ b/rust/datafusion/tests/sql.rs
@@ -171,11 +171,12 @@ fn aggr_test_schema() -> Arc<Schema> {
}
fn register_aggregate_csv(ctx: &mut ExecutionContext) {
+ let testdata = env::var("ARROW_TEST_DATA").expect("ARROW_TEST_DATA not defined");
let schema = aggr_test_schema();
register_csv(
ctx,
"aggregate_test_100",
- "../../testing/data/csv/aggregate_test_100.csv",
+ &format!("{}/csv/aggregate_test_100.csv", testdata),
&schema,
);
}
@@ -190,7 +191,7 @@ fn register_csv(
}
fn load_parquet_table(name: &str) -> Rc<TableProvider> {
- let testdata = env::var("PARQUET_TEST_DATA").unwrap();
+ let testdata = env::var("PARQUET_TEST_DATA").expect("PARQUET_TEST_DATA not defined");
let filename = format!("{}/{}", testdata, name);
let table = ParquetTable::try_new(&filename).unwrap();
Rc::new(table)