You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ag...@apache.org on 2020/03/29 18:37:26 UTC
[arrow] branch master updated: ARROW-8256: [Rust] [DataFusion]
Update CLI documentation for 0.17.0 release
This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e33fec7 ARROW-8256: [Rust] [DataFusion] Update CLI documentation for 0.17.0 release
e33fec7 is described below
commit e33fec73fdbe395baae6899fe1a4eb2e4f46705f
Author: Andy Grove <an...@gmail.com>
AuthorDate: Sun Mar 29 12:37:04 2020 -0600
ARROW-8256: [Rust] [DataFusion] Update CLI documentation for 0.17.0 release
Update CLI documentation for 0.17.0 release and minor improvements to CLI user experience.
Closes #6752 from andygrove/datafusion-cli
Authored-by: Andy Grove <an...@gmail.com>
Signed-off-by: Andy Grove <an...@gmail.com>
---
rust/datafusion/README.md | 122 +++++-----------------------------------
rust/datafusion/docs/cli.md | 95 +++++++++++++++++++++++++++++++
rust/datafusion/src/bin/repl.rs | 38 +++++++++----
3 files changed, 136 insertions(+), 119 deletions(-)
diff --git a/rust/datafusion/README.md b/rust/datafusion/README.md
index 6c33b35..9b9e8b5 100644
--- a/rust/datafusion/README.md
+++ b/rust/datafusion/README.md
@@ -21,45 +21,18 @@
DataFusion is an in-memory query engine that uses Apache Arrow as the memory model. It supports executing SQL queries against CSV and Parquet files as well as querying directly against in-memory data.
-## Usage
+## Using DataFusion as a library
-
-#### Use as a lib
-Add this to your Cargo.toml:
+DataFusion can be used as a library by adding the following to your `Cargo.toml` file.
```toml
[dependencies]
datafusion = "1.0.0-SNAPSHOT"
```
-#### Use as a bin
-##### Build your own bin(requires rust toolchains)
-```sh
-git clone https://github.com/apache/arrow
-cd arrow/rust/datafusion
-cargo run --bin datafusion-cli
-```
-##### Use Dockerfile
-```sh
-git clone https://github.com/apache/arrow
-cd arrow
-docker build -f rust/datafusion/Dockerfile . --tag datafusion-cli
-docker run -it -v $(your_data_location):/data datafusion-cli
-```
-
-```
-USAGE:
- datafusion-cli [OPTIONS]
-
-FLAGS:
- -h, --help Prints help information
- -V, --version Prints version information
-
-OPTIONS:
- -c, --batch-size <batch-size> The batch size of each query, default value is 1048576
- -p, --data-path <data-path> Path to your data, default to current directory
-```
+## Using DataFusion as a binary
+DataFusion includes a simple command-line interactive SQL utility. See the [CLI reference](docs/cli.md) for more information.
# Status
@@ -69,20 +42,24 @@ OPTIONS:
- [x] SQL Query Planner
- [x] Query Optimizer
- [x] Projection push down
+- [x] Projection push down
- [ ] Predicate push down
- [x] Type coercion
-- [ ] Parallel query execution
+- [x] Parallel query execution
## SQL Support
- [x] Projection
- [x] Selection
+- [x] Limit
- [x] Aggregate
+- [x] UDFs
+- [x] Common math functions
+- [ ] Common string functions
+- [ ] Common date/time functions
- [ ] Sorting
-- [x] Limit
-- [ ] Nested types and dot notation
+- [ ] Nested types
- [ ] Lists
-- [ ] UDFs
- [ ] Subqueries
- [ ] Joins
@@ -92,76 +69,5 @@ OPTIONS:
- [x] Parquet primitive types
- [ ] Parquet nested types
-# Example
-
-Here is a brief example for running a SQL query against a CSV file. See the [examples](examples) directory for full examples.
-
-```rust
-fn main() {
- // create local execution context
- let mut ctx = ExecutionContext::new();
-
- // define schema for data source (csv file)
- let schema = Arc::new(Schema::new(vec![
- Field::new("c1", DataType::Utf8, false),
- Field::new("c2", DataType::UInt32, false),
- Field::new("c3", DataType::Int8, false),
- Field::new("c4", DataType::Int16, false),
- Field::new("c5", DataType::Int32, false),
- Field::new("c6", DataType::Int64, false),
- Field::new("c7", DataType::UInt8, false),
- Field::new("c8", DataType::UInt16, false),
- Field::new("c9", DataType::UInt32, false),
- Field::new("c10", DataType::UInt64, false),
- Field::new("c11", DataType::Float32, false),
- Field::new("c12", DataType::Float64, false),
- Field::new("c13", DataType::Utf8, false),
- ]));
-
- // register csv file with the execution context
- let csv_datasource = CsvDataSource::new(
- "../../testing/data/csv/aggregate_test_100.csv",
- schema.clone(),
- 1024,
- );
- ctx.register_datasource("aggregate_test_100", Rc::new(RefCell::new(csv_datasource)));
-
- // execute the query
- let sql = "SELECT c1, MIN(c12), MAX(c12) FROM aggregate_test_100 WHERE c11 > 0.1 AND c11 < 0.9 GROUP BY c1";
- let relation = ctx.sql(&sql).unwrap();
- let mut results = relation.borrow_mut();
-
- // iterate over result batches
- while let Some(batch) = results.next().unwrap() {
- println!(
- "RecordBatch has {} rows and {} columns",
- batch.num_rows(),
- batch.num_columns()
- );
-
- let c1 = batch
- .column(0)
- .as_any()
- .downcast_ref::<StringArray>()
- .unwrap();
-
- let min = batch
- .column(1)
- .as_any()
- .downcast_ref::<Float64Array>()
- .unwrap();
-
- let max = batch
- .column(2)
- .as_any()
- .downcast_ref::<Float64Array>()
- .unwrap();
-
- for i in 0..batch.num_rows() {
- let c1_value: String = String::from_utf8(c1.value(i).to_vec()).unwrap();
-
- println!("{}, Min: {}, Max: {}", c1_value, min.value(i), max.value(i),);
- }
- }
-}
-```
+# Examples
+
diff --git a/rust/datafusion/docs/cli.md b/rust/datafusion/docs/cli.md
new file mode 100644
index 0000000..aeacdee
--- /dev/null
+++ b/rust/datafusion/docs/cli.md
@@ -0,0 +1,95 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# DataFusion CLI
+
+The DataFusion CLI is a command-line interactive SQL utility that allows queries to be executed against CSV and Parquet files. It is a convenient way to try DataFusion out with your own data sources.
+
+## Run using Cargo
+
+Use the following commands to clone this repository and run the CLI. This will require the Rust toolchain to be installed. Rust can be installed from [https://rustup.rs/](https://rustup.rs/).
+
+```sh
+git clone https://github.com/apache/arrow
+cd arrow/rust/datafusion
+cargo run --bin datafusion-cli --release
+```
+
+## Run using Docker
+
+Use the following commands to clone this repository and build a Docker image containing the CLI tool. Note that there is a `.dockerignore` file in the root of the repository that may need to be deleted in order for this to work.
+
+```sh
+git clone https://github.com/apache/arrow
+cd arrow
+docker build -f rust/datafusion/Dockerfile . --tag datafusion-cli
+docker run -it -v $(your_data_location):/data datafusion-cli
+```
+
+## Usage
+
+```
+USAGE:
+ datafusion-cli [OPTIONS]
+
+FLAGS:
+ -h, --help Prints help information
+ -V, --version Prints version information
+
+OPTIONS:
+ -c, --batch-size <batch-size> The batch size of each query, default value is 1048576
+ -p, --data-path <data-path> Path to your data, default to current directory
+```
+
+Type `exit` or `quit` to exit the CLI.
+
+## Registering Parquet Data Sources
+
+Parquet data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is not necessary to provide schema information for Parquet files.
+
+```sql
+CREATE EXTERNAL TABLE taxi
+STORED AS PARQUET
+LOCATION '/mnt/nyctaxi/tripdata.parquet';
+```
+
+## Registering CSV Data Sources
+
+CSV data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is necessary to provide schema information for CSV files since DataFusion does not automatically infer the schema when using SQL to query CSV files.
+
+```sql
+CREATE EXTERNAL TABLE test (
+ c1 VARCHAR NOT NULL,
+ c2 INT NOT NULL,
+ c3 SMALLINT NOT NULL,
+ c4 SMALLINT NOT NULL,
+ c5 INT NOT NULL,
+ c6 BIGINT NOT NULL,
+ c7 SMALLINT NOT NULL,
+ c8 INT NOT NULL,
+ c9 BIGINT NOT NULL,
+ c10 VARCHAR NOT NULL,
+ c11 FLOAT NOT NULL,
+ c12 DOUBLE NOT NULL,
+ c13 VARCHAR NOT NULL
+)
+STORED AS CSV
+WITH HEADER ROW
+LOCATION '/path/to/aggregate_test_100.csv';
+```
diff --git a/rust/datafusion/src/bin/repl.rs b/rust/datafusion/src/bin/repl.rs
index d8aa21e..ea9cd2c 100644
--- a/rust/datafusion/src/bin/repl.rs
+++ b/rust/datafusion/src/bin/repl.rs
@@ -19,7 +19,6 @@
use arrow::array::*;
use arrow::datatypes::{DataType, TimeUnit};
-use arrow::record_batch::RecordBatch;
use clap::{crate_version, App, Arg};
use datafusion::error::{ExecutionError, Result};
use datafusion::execution::context::ExecutionContext;
@@ -27,6 +26,7 @@ use prettytable::{Cell, Row, Table};
use rustyline::Editor;
use std::env;
use std::path::Path;
+use std::time::Instant;
pub fn main() {
let matches = App::new("DataFusion")
@@ -71,6 +71,9 @@ pub fn main() {
loop {
let readline = rl.readline("> ");
match readline {
+ Ok(ref line) if is_exit_command(line) && query.is_empty() => {
+ break;
+ }
Ok(ref line) if line.trim_end().ends_with(';') => {
query.push_str(line.trim_end());
rl.add_history_entry(query.clone());
@@ -93,25 +96,30 @@ pub fn main() {
rl.save_history(".history").ok();
}
+fn is_exit_command(line: &str) -> bool {
+ let line = line.trim_end().to_lowercase();
+ line == "quit" || line == "exit"
+}
+
fn exec_and_print(
ctx: &mut ExecutionContext,
sql: String,
batch_size: usize,
) -> Result<()> {
- let results = ctx.sql(&sql, batch_size)?;
- print_result(&results)?;
+ let now = Instant::now();
- Ok(())
-}
-
-fn print_result(results: &Vec<RecordBatch>) -> Result<()> {
- let mut row_count = 0;
- let mut table = Table::new();
+ let results = ctx.sql(&sql, batch_size)?;
if results.is_empty() {
+ println!(
+ "0 rows in set. Query took {} seconds.",
+ now.elapsed().as_secs()
+ );
return Ok(());
}
+ let mut row_count = 0;
+ let mut table = Table::new();
let schema = results[0].schema();
let mut header = Vec::new();
@@ -135,9 +143,17 @@ fn print_result(results: &Vec<RecordBatch>) -> Result<()> {
table.printstd();
if row_count > 1 {
- println!("{} rows in set.", row_count);
+ println!(
+ "{} rows in set. Query took {} seconds.",
+ row_count,
+ now.elapsed().as_secs()
+ );
} else {
- println!("{} row in set.", row_count);
+ println!(
+ "{} row in set. Query took {} seconds.",
+ row_count,
+ now.elapsed().as_secs()
+ );
}
Ok(())