You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ag...@apache.org on 2022/05/24 04:27:34 UTC

[arrow-ballista] branch master updated: MINOR: Improve the examples (#34)

This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-ballista.git


The following commit(s) were added to refs/heads/master by this push:
     new 30d21579 MINOR: Improve the examples (#34)
30d21579 is described below

commit 30d2157920d418cbc1b9a58eb6a2819c8e094950
Author: Andy Grove <an...@gmail.com>
AuthorDate: Mon May 23 22:27:30 2022 -0600

    MINOR: Improve the examples (#34)
---
 .github/workflows/rust.yml                         |   4 +-
 Cargo.toml                                         |   2 +-
 README.md                                          |   3 +-
 ballista-examples/README.md                        |  55 --------
 {ballista-examples => examples}/Cargo.toml         |   5 +-
 examples/README.md                                 | 153 +++++++++++++++++++++
 .../examples/standalone-sql.rs                     |   9 +-
 .../src/bin/dataframe.rs                           |   0
 .../bin/ballista-sql.rs => examples/src/bin/sql.rs |   4 +-
 .../testdata/aggregate_test_100.csv                |   0
 .../testdata/alltypes_plain.parquet                | Bin
 11 files changed, 165 insertions(+), 70 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index b2181222..e8ffab29 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -110,8 +110,8 @@ jobs:
           export ARROW_TEST_DATA=$(pwd)/testing/data
           export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data
           cargo test
-          cd ballista-examples
-          cargo run --example test_sql --features=ballista/standalone
+          cd examples
+          cargo run --example standalone_sql --features=ballista/standalone
         env:
           CARGO_HOME: "/github/home/.cargo"
           CARGO_TARGET_DIR: "/github/home/target"
diff --git a/Cargo.toml b/Cargo.toml
index 2e2de363..c54fc112 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,7 +22,7 @@ members = [
     "ballista/rust/core",
     "ballista/rust/executor",
     "ballista/rust/scheduler",
-    "ballista-examples",
+    "examples",
 ]
 exclude = ["ballista-cli"]
 
diff --git a/README.md b/README.md
index 95c202a2..88b438b5 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,8 @@ The current initiatives being considered are:
 
 # Getting Started
 
-Refer to the core [Ballista crate README](ballista/rust/client/README.md) for the Getting Started guide.
+The easiest way to get started is to run one of the standalone or distributed [examples](./examples/README.md). After
+that, refer to the [Getting Started Guide](ballista/rust/client/README.md).
 
 ## Architecture Overview
 
diff --git a/ballista-examples/README.md b/ballista-examples/README.md
deleted file mode 100644
index 8dd83fc9..00000000
--- a/ballista-examples/README.md
+++ /dev/null
@@ -1,55 +0,0 @@
-<!---
-  Licensed to the Apache Software Foundation (ASF) under one
-  or more contributor license agreements.  See the NOTICE file
-  distributed with this work for additional information
-  regarding copyright ownership.  The ASF licenses this file
-  to you under the Apache License, Version 2.0 (the
-  "License"); you may not use this file except in compliance
-  with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing,
-  software distributed under the License is distributed on an
-  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-  KIND, either express or implied.  See the License for the
-  specific language governing permissions and limitations
-  under the License.
--->
-
-# Ballista Examples
-
-This directory contains examples for executing distributed queries with Ballista.
-
-For background information on the Ballista architecture, refer to 
-the [Ballista README](../ballista/README.md).
-
-## Start a standalone cluster
-
-From the root of the arrow-datafusion project, build release binaries.
-
-```bash
-cargo build --release
-```
-
-Start a Ballista scheduler process in a new terminal session.
-
-```bash
-RUST_LOG=info ./target/release/ballista-scheduler
-```
-
-Start one or more Ballista executor processes in new terminal sessions. When starting more than one 
-executor, a unique port number must be specified for each executor.
-
-```bash
-RUST_LOG=info ./target/release/ballista-executor -c 4
-```
-
-## Running the examples
-
-The examples can be run using the `cargo run --bin` syntax. 
-
-```bash
-cargo run --release --bin ballista-dataframe
-```
-
diff --git a/ballista-examples/Cargo.toml b/examples/Cargo.toml
similarity index 96%
rename from ballista-examples/Cargo.toml
rename to examples/Cargo.toml
index 6a26ce73..a54cf41d 100644
--- a/ballista-examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -28,10 +28,9 @@ edition = "2021"
 publish = false
 rust-version = "1.59"
 
-
 [[example]]
-name = "test_sql"
-path = "examples/test_sql.rs"
+name = "standalone_sql"
+path = "examples/standalone-sql.rs"
 required-features = ["ballista/standalone"]
 
 [dependencies]
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 00000000..1fb29bf9
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,153 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Ballista Examples
+
+This directory contains examples for executing distributed queries with Ballista.
+
+# Standalone Examples
+
+The standalone example is the easiest to get started with. Ballista supports a standalone mode where a scheduler 
+and executor are started in-process. Run the example from the `examples` directory:
+
+```bash
+cargo run --example standalone_sql --features="ballista/standalone"
+```
+
+### Source code for standalone SQL example
+
+```rust
+#[tokio::main]
+async fn main() -> Result<()> {
+    let config = BallistaConfig::builder()
+        .set("ballista.shuffle.partitions", "1")
+        .build()?;
+
+    let ctx = BallistaContext::standalone(&config, 2).await?;
+
+    ctx.register_csv(
+        "test",
+        "testdata/aggregate_test_100.csv",
+        CsvReadOptions::new(),
+    )
+    .await?;
+
+    let df = ctx.sql("select count(1) from test").await?;
+
+    df.show().await?;
+    Ok(())
+}
+
+```
+
+
+# Distributed Examples
+
+For background information on the Ballista architecture, refer to
+the [Ballista README](../ballista/rust/client/README.md).
+
+## Start a local cluster
+
+From the root of the project, build release binaries.
+
+```bash
+cargo build --release
+```
+
+Start a Ballista scheduler process in a new terminal session.
+
+```bash
+RUST_LOG=info ./target/release/ballista-scheduler
+```
+
+Start one or more Ballista executor processes in new terminal sessions. When starting more than one 
+executor, a unique port number must be specified for each executor.
+
+```bash
+RUST_LOG=info ./target/release/ballista-executor -c 2 -p 50051
+RUST_LOG=info ./target/release/ballista-executor -c 2 -p 50052
+```
+
+## Running the examples
+
+Once the scheduler and executors are running, the examples can be run using the `cargo run --bin` syntax.
+
+## Distributed SQL Example
+
+```bash
+cargo run --release --bin sql
+```
+
+### Source code for distributed SQL example
+
+```rust
+#[tokio::main]
+async fn main() -> Result<()> {
+    let config = BallistaConfig::builder()
+        .set("ballista.shuffle.partitions", "4")
+        .build()?;
+    let ctx = BallistaContext::remote("localhost", 50050, &config).await?;
+
+    let filename = "testdata/alltypes_plain.parquet";
+
+    // define the query using the DataFrame trait
+    let df = ctx
+        .read_parquet(filename, ParquetReadOptions::default())
+        .await?
+        .select_columns(&["id", "bool_col", "timestamp_col"])?
+        .filter(col("id").gt(lit(1)))?;
+
+    // print the results
+    df.show().await?;
+
+    Ok(())
+}
+```
+
+## Distributed DataFrame Example
+
+```bash
+cargo run --release --bin dataframe
+```
+
+### Source code for distributed DataFrame example
+
+```rust
+#[tokio::main]
+async fn main() -> Result<()> {
+    let config = BallistaConfig::builder()
+        .set("ballista.shuffle.partitions", "4")
+        .build()?;
+    let ctx = BallistaContext::remote("localhost", 50050, &config).await?;
+
+    let filename = "testdata/alltypes_plain.parquet";
+
+    // define the query using the DataFrame trait
+    let df = ctx
+        .read_parquet(filename, ParquetReadOptions::default())
+        .await?
+        .select_columns(&["id", "bool_col", "timestamp_col"])?
+        .filter(col("id").gt(lit(1)))?;
+
+    // print the results
+    df.show().await?;
+
+    Ok(())
+}
+```
\ No newline at end of file
diff --git a/ballista-examples/examples/test_sql.rs b/examples/examples/standalone-sql.rs
similarity index 84%
rename from ballista-examples/examples/test_sql.rs
rename to examples/examples/standalone-sql.rs
index 4220e4aa..1874be3c 100644
--- a/ballista-examples/examples/test_sql.rs
+++ b/examples/examples/standalone-sql.rs
@@ -18,26 +18,23 @@
 use ballista::prelude::{BallistaConfig, BallistaContext, Result};
 use datafusion::prelude::CsvReadOptions;
 
-/// This example show the udf plugin is work
-///
 #[tokio::main]
 async fn main() -> Result<()> {
     let config = BallistaConfig::builder()
         .set("ballista.shuffle.partitions", "1")
         .build()?;
 
-    let ctx = BallistaContext::standalone(&config, 10).await.unwrap();
+    let ctx = BallistaContext::standalone(&config, 2).await?;
 
     // register csv file with the execution context
     ctx.register_csv(
-        "aggregate_test_100",
+        "test",
         "testdata/aggregate_test_100.csv",
         CsvReadOptions::new(),
     )
     .await?;
 
-    // test udf
-    let df = ctx.sql("select count(1) from aggregate_test_100").await?;
+    let df = ctx.sql("select count(1) from test").await?;
 
     df.show().await?;
     Ok(())
diff --git a/ballista-examples/src/bin/ballista-dataframe.rs b/examples/src/bin/dataframe.rs
similarity index 100%
rename from ballista-examples/src/bin/ballista-dataframe.rs
rename to examples/src/bin/dataframe.rs
diff --git a/ballista-examples/src/bin/ballista-sql.rs b/examples/src/bin/sql.rs
similarity index 96%
rename from ballista-examples/src/bin/ballista-sql.rs
rename to examples/src/bin/sql.rs
index a8209564..f8afad56 100644
--- a/ballista-examples/src/bin/ballista-sql.rs
+++ b/examples/src/bin/sql.rs
@@ -29,7 +29,7 @@ async fn main() -> Result<()> {
 
     // register csv file with the execution context
     ctx.register_csv(
-        "aggregate_test_100",
+        "test",
         "testdata/aggregate_test_100.csv",
         CsvReadOptions::new(),
     )
@@ -39,7 +39,7 @@ async fn main() -> Result<()> {
     let df = ctx
         .sql(
             "SELECT c1, MIN(c12), MAX(c12) \
-        FROM aggregate_test_100 \
+        FROM test \
         WHERE c11 > 0.1 AND c11 < 0.9 \
         GROUP BY c1",
         )
diff --git a/ballista-examples/testdata/aggregate_test_100.csv b/examples/testdata/aggregate_test_100.csv
similarity index 100%
rename from ballista-examples/testdata/aggregate_test_100.csv
rename to examples/testdata/aggregate_test_100.csv
diff --git a/ballista-examples/testdata/alltypes_plain.parquet b/examples/testdata/alltypes_plain.parquet
similarity index 100%
rename from ballista-examples/testdata/alltypes_plain.parquet
rename to examples/testdata/alltypes_plain.parquet