Posted to commits@arrow.apache.org by ag...@apache.org on 2020/03/29 18:37:26 UTC

[arrow] branch master updated: ARROW-8256: [Rust] [DataFusion] Update CLI documentation for 0.17.0 release

This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new e33fec7  ARROW-8256: [Rust] [DataFusion] Update CLI documentation for 0.17.0 release
e33fec7 is described below

commit e33fec73fdbe395baae6899fe1a4eb2e4f46705f
Author: Andy Grove <an...@gmail.com>
AuthorDate: Sun Mar 29 12:37:04 2020 -0600

    ARROW-8256: [Rust] [DataFusion] Update CLI documentation for 0.17.0 release
    
    Update CLI documentation for 0.17.0 release and minor improvements to CLI user experience.
    
    Closes #6752 from andygrove/datafusion-cli
    
    Authored-by: Andy Grove <an...@gmail.com>
    Signed-off-by: Andy Grove <an...@gmail.com>
---
 rust/datafusion/README.md       | 122 +++++-----------------------------------
 rust/datafusion/docs/cli.md     |  95 +++++++++++++++++++++++++++++++
 rust/datafusion/src/bin/repl.rs |  38 +++++++++----
 3 files changed, 136 insertions(+), 119 deletions(-)

diff --git a/rust/datafusion/README.md b/rust/datafusion/README.md
index 6c33b35..9b9e8b5 100644
--- a/rust/datafusion/README.md
+++ b/rust/datafusion/README.md
@@ -21,45 +21,18 @@
 
 DataFusion is an in-memory query engine that uses Apache Arrow as the memory model. It supports executing SQL queries against CSV and Parquet files as well as querying directly against in-memory data.
 
-## Usage
+## Using DataFusion as a library
 
-
-#### Use as a lib
-Add this to your Cargo.toml:
+DataFusion can be used as a library by adding the following to your `Cargo.toml` file.
 
 ```toml
 [dependencies]
 datafusion = "1.0.0-SNAPSHOT"
 ```
 
-#### Use as a bin
-##### Build your own bin(requires rust toolchains)
-```sh
-git clone https://github.com/apache/arrow
-cd arrow/rust/datafusion
-cargo run --bin datafusion-cli
-```
-##### Use Dockerfile
-```sh
-git clone https://github.com/apache/arrow
-cd arrow
-docker build -f rust/datafusion/Dockerfile . --tag datafusion-cli
-docker run -it -v $(your_data_location):/data datafusion-cli
-```
-
-```
-USAGE:
-    datafusion-cli [OPTIONS]
-
-FLAGS:
-    -h, --help       Prints help information
-    -V, --version    Prints version information
-
-OPTIONS:
-    -c, --batch-size <batch-size>    The batch size of each query, default value is 1048576
-    -p, --data-path <data-path>      Path to your data, default to current directory
-```
+## Using DataFusion as a binary
 
+DataFusion includes a simple command-line interactive SQL utility. See the [CLI reference](docs/cli.md) for more information.
 
 # Status
 
@@ -69,20 +42,24 @@ OPTIONS:
 - [x] SQL Query Planner
 - [x] Query Optimizer
 - [x] Projection push down
 - [ ] Predicate push down
 - [x] Type coercion
-- [ ] Parallel query execution
+- [x] Parallel query execution
 
 ## SQL Support
 
 - [x] Projection
 - [x] Selection
+- [x] Limit
 - [x] Aggregate
+- [x] UDFs
+- [x] Common math functions
+- [ ] Common string functions
+- [ ] Common date/time functions
 - [ ] Sorting
-- [x] Limit
-- [ ] Nested types and dot notation
+- [ ] Nested types
 - [ ] Lists
-- [ ] UDFs
 - [ ] Subqueries
 - [ ] Joins
 
@@ -92,76 +69,5 @@ OPTIONS:
 - [x] Parquet primitive types
 - [ ] Parquet nested types
 
-# Example
-
-Here is a brief example for running a SQL query against a CSV file. See the [examples](examples) directory for full examples.
-
-```rust
-fn main() {
-    // create local execution context
-    let mut ctx = ExecutionContext::new();
-
-    // define schema for data source (csv file)
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("c1", DataType::Utf8, false),
-        Field::new("c2", DataType::UInt32, false),
-        Field::new("c3", DataType::Int8, false),
-        Field::new("c4", DataType::Int16, false),
-        Field::new("c5", DataType::Int32, false),
-        Field::new("c6", DataType::Int64, false),
-        Field::new("c7", DataType::UInt8, false),
-        Field::new("c8", DataType::UInt16, false),
-        Field::new("c9", DataType::UInt32, false),
-        Field::new("c10", DataType::UInt64, false),
-        Field::new("c11", DataType::Float32, false),
-        Field::new("c12", DataType::Float64, false),
-        Field::new("c13", DataType::Utf8, false),
-    ]));
-
-    // register csv file with the execution context
-    let csv_datasource = CsvDataSource::new(
-        "../../testing/data/csv/aggregate_test_100.csv",
-        schema.clone(),
-        1024,
-    );
-    ctx.register_datasource("aggregate_test_100", Rc::new(RefCell::new(csv_datasource)));
-
-    // execute the query
-    let sql = "SELECT c1, MIN(c12), MAX(c12) FROM aggregate_test_100 WHERE c11 > 0.1 AND c11 < 0.9 GROUP BY c1";
-    let relation = ctx.sql(&sql).unwrap();
-    let mut results = relation.borrow_mut();
-
-    // iterate over result batches
-    while let Some(batch) = results.next().unwrap() {
-        println!(
-            "RecordBatch has {} rows and {} columns",
-            batch.num_rows(),
-            batch.num_columns()
-        );
-
-        let c1 = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<StringArray>()
-            .unwrap();
-
-        let min = batch
-            .column(1)
-            .as_any()
-            .downcast_ref::<Float64Array>()
-            .unwrap();
-
-        let max = batch
-            .column(2)
-            .as_any()
-            .downcast_ref::<Float64Array>()
-            .unwrap();
-
-        for i in 0..batch.num_rows() {
-            let c1_value: String = String::from_utf8(c1.value(i).to_vec()).unwrap();
-
-            println!("{}, Min: {}, Max: {}", c1_value, min.value(i), max.value(i),);
-        }
-    }
-}
-```
+# Examples
+
+See the [examples](examples) directory for full examples of running SQL queries programmatically.
diff --git a/rust/datafusion/docs/cli.md b/rust/datafusion/docs/cli.md
new file mode 100644
index 0000000..aeacdee
--- /dev/null
+++ b/rust/datafusion/docs/cli.md
@@ -0,0 +1,95 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# DataFusion CLI
+
+The DataFusion CLI is a command-line interactive SQL utility that allows queries to be executed against CSV and Parquet files. It is a convenient way to try DataFusion out with your own data sources.
+
+## Run using Cargo
+
+Use the following commands to clone this repository and run the CLI. This requires the Rust toolchain, which can be installed from [https://rustup.rs/](https://rustup.rs/).
+
+```sh
+git clone https://github.com/apache/arrow
+cd arrow/rust/datafusion
+cargo run --bin datafusion-cli --release
+```
+
+## Run using Docker
+
+Use the following commands to clone this repository and build a Docker image containing the CLI tool. Note that there is a `.dockerignore` file in the root of the repository that may need to be deleted in order for the build to work.
+
+```sh
+git clone https://github.com/apache/arrow
+cd arrow
+docker build -f rust/datafusion/Dockerfile . --tag datafusion-cli
+docker run -it -v $(your_data_location):/data datafusion-cli
+```
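+
+Note that `$(your_data_location)` above is a placeholder rather than a shell command; replace it with the path to a directory containing your data files. For example:
+
+```sh
+docker run -it -v /mnt/nyctaxi:/data datafusion-cli
+```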
+
+## Usage
+
+```
+USAGE:
+    datafusion-cli [OPTIONS]
+
+FLAGS:
+    -h, --help       Prints help information
+    -V, --version    Prints version information
+
+OPTIONS:
+    -c, --batch-size <batch-size>    The batch size of each query, default value is 1048576
+    -p, --data-path <data-path>      Path to your data, default to current directory
+```
+
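+Options can be passed through Cargo after `--`. For example, to point the CLI at a specific data directory and use a smaller batch size (the path and batch size here are illustrative):
+
+```sh
+cargo run --bin datafusion-cli --release -- --data-path /mnt/nyctaxi --batch-size 4096
+```
+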
+Type `exit` or `quit` to exit the CLI.
+
+## Registering Parquet Data Sources
+
+Parquet data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is not necessary to provide schema information for Parquet files, since Parquet files are self-describing and embed their own schema.
+
+```sql
+CREATE EXTERNAL TABLE taxi 
+STORED AS PARQUET
+LOCATION '/mnt/nyctaxi/tripdata.parquet';
+```
+
+## Registering CSV Data Sources
+
+CSV data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is necessary to provide schema information for CSV files, since DataFusion does not automatically infer the schema when querying CSV files via SQL.
+
+```sql
+CREATE EXTERNAL TABLE test (
+    c1  VARCHAR NOT NULL,
+    c2  INT NOT NULL,
+    c3  SMALLINT NOT NULL,
+    c4  SMALLINT NOT NULL,
+    c5  INT NOT NULL,
+    c6  BIGINT NOT NULL,
+    c7  SMALLINT NOT NULL,
+    c8  INT NOT NULL,
+    c9  BIGINT NOT NULL,
+    c10 VARCHAR NOT NULL,
+    c11 FLOAT NOT NULL,
+    c12 DOUBLE NOT NULL,
+    c13 VARCHAR NOT NULL
+)
+STORED AS CSV
+WITH HEADER ROW
+LOCATION '/path/to/aggregate_test_100.csv';
+```
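+
+## Example Query
+
+Once a table is registered it can be queried with standard SQL. For example, the following query (adapted from the example that previously lived in the README) aggregates the CSV table registered above:
+
+```sql
+SELECT c1, MIN(c12), MAX(c12)
+FROM test
+WHERE c11 > 0.1 AND c11 < 0.9
+GROUP BY c1;
+```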
diff --git a/rust/datafusion/src/bin/repl.rs b/rust/datafusion/src/bin/repl.rs
index d8aa21e..ea9cd2c 100644
--- a/rust/datafusion/src/bin/repl.rs
+++ b/rust/datafusion/src/bin/repl.rs
@@ -19,7 +19,6 @@
 
 use arrow::array::*;
 use arrow::datatypes::{DataType, TimeUnit};
-use arrow::record_batch::RecordBatch;
 use clap::{crate_version, App, Arg};
 use datafusion::error::{ExecutionError, Result};
 use datafusion::execution::context::ExecutionContext;
@@ -27,6 +26,7 @@ use prettytable::{Cell, Row, Table};
 use rustyline::Editor;
 use std::env;
 use std::path::Path;
+use std::time::Instant;
 
 pub fn main() {
     let matches = App::new("DataFusion")
@@ -71,6 +71,9 @@ pub fn main() {
     loop {
         let readline = rl.readline("> ");
         match readline {
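+            // Exit immediately if the user types "quit" or "exit" and no partial query is buffered.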
+            Ok(ref line) if is_exit_command(line) && query.is_empty() => {
+                break;
+            }
             Ok(ref line) if line.trim_end().ends_with(';') => {
                 query.push_str(line.trim_end());
                 rl.add_history_entry(query.clone());
@@ -93,25 +96,30 @@ pub fn main() {
     rl.save_history(".history").ok();
 }
 
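+/// Returns true if the line (ignoring trailing whitespace and case) is "quit" or "exit".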
+fn is_exit_command(line: &str) -> bool {
+    let line = line.trim_end().to_lowercase();
+    line == "quit" || line == "exit"
+}
+
 fn exec_and_print(
     ctx: &mut ExecutionContext,
     sql: String,
     batch_size: usize,
 ) -> Result<()> {
-    let results = ctx.sql(&sql, batch_size)?;
-    print_result(&results)?;
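+    // Record the start time so the elapsed query time can be reported with the results.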
+    let now = Instant::now();
 
-    Ok(())
-}
-
-fn print_result(results: &Vec<RecordBatch>) -> Result<()> {
-    let mut row_count = 0;
-    let mut table = Table::new();
+    let results = ctx.sql(&sql, batch_size)?;
 
     if results.is_empty() {
+        println!(
+            "0 rows in set. Query took {} seconds.",
+            now.elapsed().as_secs()
+        );
         return Ok(());
     }
 
+    let mut row_count = 0;
+    let mut table = Table::new();
     let schema = results[0].schema();
 
     let mut header = Vec::new();
@@ -135,9 +143,17 @@ fn print_result(results: &Vec<RecordBatch>) -> Result<()> {
     table.printstd();
 
     if row_count > 1 {
-        println!("{} rows in set.", row_count);
+        println!(
+            "{} row in set. Query took {} seconds.",
+            row_count,
+            now.elapsed().as_secs()
+        );
     } else {
-        println!("{} row in set.", row_count);
+        println!(
+            "{} rows in set. Query took {} seconds.",
+            row_count,
+            now.elapsed().as_secs()
+        );
     }
 
     Ok(())