You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ag...@apache.org on 2022/05/24 04:27:34 UTC
[arrow-ballista] branch master updated: MINOR: Improve the examples (#34)
This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-ballista.git
The following commit(s) were added to refs/heads/master by this push:
new 30d21579 MINOR: Improve the examples (#34)
30d21579 is described below
commit 30d2157920d418cbc1b9a58eb6a2819c8e094950
Author: Andy Grove <an...@gmail.com>
AuthorDate: Mon May 23 22:27:30 2022 -0600
MINOR: Improve the examples (#34)
---
.github/workflows/rust.yml | 4 +-
Cargo.toml | 2 +-
README.md | 3 +-
ballista-examples/README.md | 55 --------
{ballista-examples => examples}/Cargo.toml | 5 +-
examples/README.md | 153 +++++++++++++++++++++
.../examples/standalone-sql.rs | 9 +-
.../src/bin/dataframe.rs | 0
.../bin/ballista-sql.rs => examples/src/bin/sql.rs | 4 +-
.../testdata/aggregate_test_100.csv | 0
.../testdata/alltypes_plain.parquet | Bin
11 files changed, 165 insertions(+), 70 deletions(-)
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index b2181222..e8ffab29 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -110,8 +110,8 @@ jobs:
export ARROW_TEST_DATA=$(pwd)/testing/data
export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data
cargo test
- cd ballista-examples
- cargo run --example test_sql --features=ballista/standalone
+ cd examples
+ cargo run --example standalone_sql --features=ballista/standalone
env:
CARGO_HOME: "/github/home/.cargo"
CARGO_TARGET_DIR: "/github/home/target"
diff --git a/Cargo.toml b/Cargo.toml
index 2e2de363..c54fc112 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,7 +22,7 @@ members = [
"ballista/rust/core",
"ballista/rust/executor",
"ballista/rust/scheduler",
- "ballista-examples",
+ "examples",
]
exclude = ["ballista-cli"]
diff --git a/README.md b/README.md
index 95c202a2..88b438b5 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,8 @@ The current initiatives being considered are:
# Getting Started
-Refer to the core [Ballista crate README](ballista/rust/client/README.md) for the Getting Started guide.
+The easiest way to get started is to run one of the standalone or distributed [examples](./examples/README.md). After
+that, refer to the [Getting Started Guide](ballista/rust/client/README.md).
## Architecture Overview
diff --git a/ballista-examples/README.md b/ballista-examples/README.md
deleted file mode 100644
index 8dd83fc9..00000000
--- a/ballista-examples/README.md
+++ /dev/null
@@ -1,55 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-# Ballista Examples
-
-This directory contains examples for executing distributed queries with Ballista.
-
-For background information on the Ballista architecture, refer to
-the [Ballista README](../ballista/README.md).
-
-## Start a standalone cluster
-
-From the root of the arrow-datafusion project, build release binaries.
-
-```bash
-cargo build --release
-```
-
-Start a Ballista scheduler process in a new terminal session.
-
-```bash
-RUST_LOG=info ./target/release/ballista-scheduler
-```
-
-Start one or more Ballista executor processes in new terminal sessions. When starting more than one
-executor, a unique port number must be specified for each executor.
-
-```bash
-RUST_LOG=info ./target/release/ballista-executor -c 4
-```
-
-## Running the examples
-
-The examples can be run using the `cargo run --bin` syntax.
-
-```bash
-cargo run --release --bin ballista-dataframe
-```
-
diff --git a/ballista-examples/Cargo.toml b/examples/Cargo.toml
similarity index 96%
rename from ballista-examples/Cargo.toml
rename to examples/Cargo.toml
index 6a26ce73..a54cf41d 100644
--- a/ballista-examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -28,10 +28,9 @@ edition = "2021"
publish = false
rust-version = "1.59"
-
[[example]]
-name = "test_sql"
-path = "examples/test_sql.rs"
+name = "standalone_sql"
+path = "examples/standalone-sql.rs"
required-features = ["ballista/standalone"]
[dependencies]
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 00000000..1fb29bf9
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,153 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Ballista Examples
+
+This directory contains examples for executing distributed queries with Ballista.
+
+# Standalone Examples
+
+The standalone example is the easiest to get started with. Ballista supports a standalone mode where a scheduler
+and executor are started in-process.
+
+```bash
+cargo run --example standalone_sql --features="ballista/standalone"
+```
+
+### Source code for standalone SQL example
+
+```rust
+#[tokio::main]
+async fn main() -> Result<()> {
+ let config = BallistaConfig::builder()
+ .set("ballista.shuffle.partitions", "1")
+ .build()?;
+
+ let ctx = BallistaContext::standalone(&config, 2).await?;
+
+ ctx.register_csv(
+ "test",
+ "testdata/aggregate_test_100.csv",
+ CsvReadOptions::new(),
+ )
+ .await?;
+
+ let df = ctx.sql("select count(1) from test").await?;
+
+ df.show().await?;
+ Ok(())
+}
+
+```
+
+
+# Distributed Examples
+
+For background information on the Ballista architecture, refer to
+the [Ballista README](../ballista/rust/client/README.md).
+
+## Start a Ballista cluster
+
+From the root of the project, build release binaries.
+
+```bash
+cargo build --release
+```
+
+Start a Ballista scheduler process in a new terminal session.
+
+```bash
+RUST_LOG=info ./target/release/ballista-scheduler
+```
+
+Start one or more Ballista executor processes in new terminal sessions. When starting more than one
+executor, a unique port number must be specified for each executor.
+
+```bash
+RUST_LOG=info ./target/release/ballista-executor -c 2 -p 50051
+RUST_LOG=info ./target/release/ballista-executor -c 2 -p 50052
+```
+
+## Running the examples
+
+The examples can be run using the `cargo run --bin` syntax.
+
+## Distributed SQL Example
+
+```bash
+cargo run --release --bin sql
+```
+
+### Source code for distributed SQL example
+
+```rust
+#[tokio::main]
+async fn main() -> Result<()> {
+ let config = BallistaConfig::builder()
+ .set("ballista.shuffle.partitions", "4")
+ .build()?;
+ let ctx = BallistaContext::remote("localhost", 50050, &config).await?;
+
+ // register csv file with the execution context
+ ctx.register_csv("test", "testdata/aggregate_test_100.csv", CsvReadOptions::new())
+ .await?;
+
+ // execute the query
+ let df = ctx
+ .sql("SELECT c1, MIN(c12), MAX(c12) FROM test WHERE c11 > 0.1 AND c11 < 0.9 GROUP BY c1")
+ .await?;
+
+ // print the results
+ df.show().await?;
+
+ Ok(())
+}
+```
+
+## Distributed DataFrame Example
+
+```bash
+cargo run --release --bin dataframe
+```
+
+### Source code for distributed DataFrame example
+
+```rust
+#[tokio::main]
+async fn main() -> Result<()> {
+ let config = BallistaConfig::builder()
+ .set("ballista.shuffle.partitions", "4")
+ .build()?;
+ let ctx = BallistaContext::remote("localhost", 50050, &config).await?;
+
+ let filename = "testdata/alltypes_plain.parquet";
+
+ // define the query using the DataFrame trait
+ let df = ctx
+ .read_parquet(filename, ParquetReadOptions::default())
+ .await?
+ .select_columns(&["id", "bool_col", "timestamp_col"])?
+ .filter(col("id").gt(lit(1)))?;
+
+ // print the results
+ df.show().await?;
+
+ Ok(())
+}
+```
\ No newline at end of file
diff --git a/ballista-examples/examples/test_sql.rs b/examples/examples/standalone-sql.rs
similarity index 84%
rename from ballista-examples/examples/test_sql.rs
rename to examples/examples/standalone-sql.rs
index 4220e4aa..1874be3c 100644
--- a/ballista-examples/examples/test_sql.rs
+++ b/examples/examples/standalone-sql.rs
@@ -18,26 +18,23 @@
use ballista::prelude::{BallistaConfig, BallistaContext, Result};
use datafusion::prelude::CsvReadOptions;
-/// This example show the udf plugin is work
-///
#[tokio::main]
async fn main() -> Result<()> {
let config = BallistaConfig::builder()
.set("ballista.shuffle.partitions", "1")
.build()?;
- let ctx = BallistaContext::standalone(&config, 10).await.unwrap();
+ let ctx = BallistaContext::standalone(&config, 2).await?;
// register csv file with the execution context
ctx.register_csv(
- "aggregate_test_100",
+ "test",
"testdata/aggregate_test_100.csv",
CsvReadOptions::new(),
)
.await?;
- // test udf
- let df = ctx.sql("select count(1) from aggregate_test_100").await?;
+ let df = ctx.sql("select count(1) from test").await?;
df.show().await?;
Ok(())
diff --git a/ballista-examples/src/bin/ballista-dataframe.rs b/examples/src/bin/dataframe.rs
similarity index 100%
rename from ballista-examples/src/bin/ballista-dataframe.rs
rename to examples/src/bin/dataframe.rs
diff --git a/ballista-examples/src/bin/ballista-sql.rs b/examples/src/bin/sql.rs
similarity index 96%
rename from ballista-examples/src/bin/ballista-sql.rs
rename to examples/src/bin/sql.rs
index a8209564..f8afad56 100644
--- a/ballista-examples/src/bin/ballista-sql.rs
+++ b/examples/src/bin/sql.rs
@@ -29,7 +29,7 @@ async fn main() -> Result<()> {
// register csv file with the execution context
ctx.register_csv(
- "aggregate_test_100",
+ "test",
"testdata/aggregate_test_100.csv",
CsvReadOptions::new(),
)
@@ -39,7 +39,7 @@ async fn main() -> Result<()> {
let df = ctx
.sql(
"SELECT c1, MIN(c12), MAX(c12) \
- FROM aggregate_test_100 \
+ FROM test \
WHERE c11 > 0.1 AND c11 < 0.9 \
GROUP BY c1",
)
diff --git a/ballista-examples/testdata/aggregate_test_100.csv b/examples/testdata/aggregate_test_100.csv
similarity index 100%
rename from ballista-examples/testdata/aggregate_test_100.csv
rename to examples/testdata/aggregate_test_100.csv
diff --git a/ballista-examples/testdata/alltypes_plain.parquet b/examples/testdata/alltypes_plain.parquet
similarity index 100%
rename from ballista-examples/testdata/alltypes_plain.parquet
rename to examples/testdata/alltypes_plain.parquet