Posted to commits@arrow.apache.org by jo...@apache.org on 2021/04/18 14:40:24 UTC
[arrow-rs] 14/14: Removed DataFusion and Ballista.
This is an automated email from the ASF dual-hosted git repository.
jorgecarleitao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
commit a889ebac9d616b50426a55210e12c21697e7c43a
Author: Jorge C. Leitao <jo...@gmail.com>
AuthorDate: Sun Apr 18 14:27:44 2021 +0000
Removed DataFusion and Ballista.
---
.dockerignore | 3 -
.github/workflows/dev_pr/labeler.yml | 6 -
rust/ballista/.dockerignore | 18 -
rust/ballista/README.md | 64 -
rust/ballista/dev/build-rust-base.sh | 21 -
rust/ballista/dev/build-rust.sh | 24 -
rust/ballista/dev/integration-tests.sh | 28 -
rust/ballista/docker/README.md | 29 -
rust/ballista/docker/rust-base.dockerfile | 99 -
rust/ballista/docker/rust.dockerfile | 71 -
rust/ballista/docs/README.md | 37 -
rust/ballista/docs/architecture.md | 75 -
rust/ballista/docs/dev-env-rust.md | 38 -
rust/ballista/docs/images/query-execution.png | Bin 11378 -> 0 bytes
rust/ballista/docs/integration-testing.md | 32 -
rust/ballista/docs/release-process.md | 68 -
rust/ballista/docs/rust-docker.md | 66 -
rust/ballista/docs/user-guide/.gitignore | 2 -
rust/ballista/docs/user-guide/README.md | 36 -
rust/ballista/docs/user-guide/book.toml | 23 -
rust/ballista/docs/user-guide/src/SUMMARY.md | 30 -
rust/ballista/docs/user-guide/src/client-rust.md | 22 -
rust/ballista/docs/user-guide/src/clients.md | 22 -
rust/ballista/docs/user-guide/src/configuration.md | 32 -
rust/ballista/docs/user-guide/src/deployment.md | 26 -
.../ballista/docs/user-guide/src/docker-compose.md | 55 -
rust/ballista/docs/user-guide/src/faq.md | 31 -
.../user-guide/src/img/ballista-architecture.png | Bin 21225 -> 0 bytes
rust/ballista/docs/user-guide/src/introduction.md | 52 -
rust/ballista/docs/user-guide/src/kubernetes.md | 216 -
rust/ballista/docs/user-guide/src/standalone.md | 92 -
rust/ballista/rust/.dockerignore | 23 -
rust/ballista/rust/.gitignore | 2 -
rust/ballista/rust/Cargo.toml | 30 -
rust/ballista/rust/benchmarks/tpch/.dockerignore | 25 -
rust/ballista/rust/benchmarks/tpch/.gitignore | 1 -
rust/ballista/rust/benchmarks/tpch/Cargo.toml | 36 -
rust/ballista/rust/benchmarks/tpch/README.md | 104 -
.../rust/benchmarks/tpch/docker-compose.yaml | 62 -
rust/ballista/rust/benchmarks/tpch/entrypoint.sh | 22 -
rust/ballista/rust/benchmarks/tpch/queries/q1.sql | 21 -
rust/ballista/rust/benchmarks/tpch/queries/q10.sql | 31 -
rust/ballista/rust/benchmarks/tpch/queries/q11.sql | 27 -
rust/ballista/rust/benchmarks/tpch/queries/q12.sql | 30 -
rust/ballista/rust/benchmarks/tpch/queries/q13.sql | 20 -
rust/ballista/rust/benchmarks/tpch/queries/q14.sql | 13 -
rust/ballista/rust/benchmarks/tpch/queries/q16.sql | 30 -
rust/ballista/rust/benchmarks/tpch/queries/q17.sql | 17 -
rust/ballista/rust/benchmarks/tpch/queries/q18.sql | 32 -
rust/ballista/rust/benchmarks/tpch/queries/q19.sql | 35 -
rust/ballista/rust/benchmarks/tpch/queries/q2.sql | 43 -
rust/ballista/rust/benchmarks/tpch/queries/q20.sql | 37 -
rust/ballista/rust/benchmarks/tpch/queries/q21.sql | 39 -
rust/ballista/rust/benchmarks/tpch/queries/q22.sql | 37 -
rust/ballista/rust/benchmarks/tpch/queries/q3.sql | 22 -
rust/ballista/rust/benchmarks/tpch/queries/q4.sql | 21 -
rust/ballista/rust/benchmarks/tpch/queries/q5.sql | 24 -
rust/ballista/rust/benchmarks/tpch/queries/q6.sql | 9 -
rust/ballista/rust/benchmarks/tpch/queries/q7.sql | 39 -
rust/ballista/rust/benchmarks/tpch/queries/q8.sql | 37 -
rust/ballista/rust/benchmarks/tpch/queries/q9.sql | 32 -
rust/ballista/rust/benchmarks/tpch/run.sh | 25 -
rust/ballista/rust/benchmarks/tpch/src/main.rs | 360 -
rust/ballista/rust/benchmarks/tpch/tpch-gen.sh | 33 -
.../rust/benchmarks/tpch/tpchgen.dockerfile | 32 -
rust/ballista/rust/client/Cargo.toml | 35 -
rust/ballista/rust/client/README.md | 22 -
rust/ballista/rust/client/src/columnar_batch.rs | 167 -
rust/ballista/rust/client/src/context.rs | 400 -
rust/ballista/rust/client/src/lib.rs | 20 -
rust/ballista/rust/client/src/prelude.rs | 23 -
rust/ballista/rust/core/Cargo.toml | 50 -
rust/ballista/rust/core/README.md | 21 -
rust/ballista/rust/core/build.rs | 26 -
rust/ballista/rust/core/proto/ballista.proto | 824 --
rust/ballista/rust/core/src/client.rs | 224 -
rust/ballista/rust/core/src/datasource.rs | 72 -
rust/ballista/rust/core/src/error.rs | 172 -
rust/ballista/rust/core/src/execution_plans/mod.rs | 27 -
.../rust/core/src/execution_plans/query_stage.rs | 92 -
.../core/src/execution_plans/shuffle_reader.rs | 106 -
.../core/src/execution_plans/unresolved_shuffle.rs | 101 -
rust/ballista/rust/core/src/lib.rs | 34 -
rust/ballista/rust/core/src/memory_stream.rs | 93 -
.../rust/core/src/serde/logical_plan/from_proto.rs | 1200 --
.../rust/core/src/serde/logical_plan/mod.rs | 929 --
.../rust/core/src/serde/logical_plan/to_proto.rs | 1233 --
rust/ballista/rust/core/src/serde/mod.rs | 69 -
.../core/src/serde/physical_plan/from_proto.rs | 398 -
.../rust/core/src/serde/physical_plan/mod.rs | 178 -
.../rust/core/src/serde/physical_plan/to_proto.rs | 556 -
.../rust/core/src/serde/scheduler/from_proto.rs | 124 -
rust/ballista/rust/core/src/serde/scheduler/mod.rs | 262 -
.../rust/core/src/serde/scheduler/to_proto.rs | 90 -
rust/ballista/rust/core/src/utils.rs | 327 -
rust/ballista/rust/executor/Cargo.toml | 59 -
rust/ballista/rust/executor/README.md | 31 -
rust/ballista/rust/executor/build.rs | 24 -
.../executor/examples/example_executor_config.toml | 22 -
.../rust/executor/executor_config_spec.toml | 79 -
rust/ballista/rust/executor/src/collect.rs | 127 -
rust/ballista/rust/executor/src/execution_loop.rs | 172 -
rust/ballista/rust/executor/src/flight_service.rs | 374 -
rust/ballista/rust/executor/src/lib.rs | 52 -
rust/ballista/rust/executor/src/main.rs | 176 -
rust/ballista/rust/scheduler/Cargo.toml | 66 -
rust/ballista/rust/scheduler/README.md | 51 -
rust/ballista/rust/scheduler/build.rs | 24 -
.../rust/scheduler/scheduler_config_spec.toml | 60 -
rust/ballista/rust/scheduler/src/api/handlers.rs | 55 -
rust/ballista/rust/scheduler/src/api/mod.rs | 87 -
rust/ballista/rust/scheduler/src/lib.rs | 490 -
rust/ballista/rust/scheduler/src/main.rs | 156 -
rust/ballista/rust/scheduler/src/planner.rs | 494 -
rust/ballista/rust/scheduler/src/state/etcd.rs | 205 -
rust/ballista/rust/scheduler/src/state/mod.rs | 880 --
.../rust/scheduler/src/state/standalone.rs | 228 -
rust/ballista/rust/scheduler/src/test_utils.rs | 148 -
.../rust/scheduler/testdata/customer/customer.tbl | 10 -
.../scheduler/testdata/lineitem/partition0.tbl | 10 -
.../scheduler/testdata/lineitem/partition1.tbl | 10 -
.../rust/scheduler/testdata/nation/nation.tbl | 10 -
.../rust/scheduler/testdata/orders/orders.tbl | 10 -
.../ballista/rust/scheduler/testdata/part/part.tbl | 10 -
.../rust/scheduler/testdata/partsupp/partsupp.tbl | 10 -
.../rust/scheduler/testdata/region/region.tbl | 5 -
.../rust/scheduler/testdata/supplier/supplier.tbl | 10 -
rust/ballista/ui/scheduler/.gitignore | 23 -
rust/ballista/ui/scheduler/README.md | 45 -
rust/ballista/ui/scheduler/index.d.ts | 18 -
rust/ballista/ui/scheduler/package.json | 58 -
rust/ballista/ui/scheduler/public/favicon.ico | Bin 3870 -> 0 bytes
rust/ballista/ui/scheduler/public/index.html | 62 -
rust/ballista/ui/scheduler/public/logo192.png | Bin 5347 -> 0 bytes
rust/ballista/ui/scheduler/public/logo512.png | Bin 9664 -> 0 bytes
rust/ballista/ui/scheduler/public/manifest.json | 25 -
rust/ballista/ui/scheduler/public/robots.txt | 20 -
rust/ballista/ui/scheduler/react-table-config.d.ts | 137 -
rust/ballista/ui/scheduler/src/App.css | 18 -
rust/ballista/ui/scheduler/src/App.test.tsx | 26 -
rust/ballista/ui/scheduler/src/App.tsx | 97 -
.../ui/scheduler/src/components/DataTable.tsx | 131 -
.../ballista/ui/scheduler/src/components/Empty.tsx | 36 -
.../ui/scheduler/src/components/Footer.tsx | 28 -
.../ui/scheduler/src/components/Header.tsx | 82 -
.../ui/scheduler/src/components/NodesList.tsx | 71 -
.../ui/scheduler/src/components/QueriesList.tsx | 115 -
.../ui/scheduler/src/components/Summary.tsx | 89 -
rust/ballista/ui/scheduler/src/components/logo.svg | 25 -
rust/ballista/ui/scheduler/src/index.css | 32 -
rust/ballista/ui/scheduler/src/index.tsx | 38 -
rust/ballista/ui/scheduler/src/react-app-env.d.ts | 18 -
rust/ballista/ui/scheduler/src/reportWebVitals.ts | 32 -
rust/ballista/ui/scheduler/src/setupTests.ts | 22 -
rust/ballista/ui/scheduler/tsconfig.json | 28 -
rust/ballista/ui/scheduler/yarn.lock | 12431 -------------------
rust/benchmarks/Cargo.toml | 42 -
rust/benchmarks/README.md | 120 -
rust/benchmarks/src/bin/nyctaxi.rs | 151 -
rust/benchmarks/src/bin/tpch.rs | 1692 ---
rust/datafusion-examples/Cargo.toml | 39 -
rust/datafusion-examples/examples/README.md | 28 -
rust/datafusion-examples/examples/csv_sql.rs | 52 -
rust/datafusion-examples/examples/dataframe.rs | 47 -
.../examples/dataframe_in_memory.rs | 67 -
rust/datafusion-examples/examples/flight_client.rs | 79 -
rust/datafusion-examples/examples/flight_server.rs | 213 -
rust/datafusion-examples/examples/parquet_sql.rs | 50 -
rust/datafusion-examples/examples/simple_udaf.rs | 170 -
rust/datafusion-examples/examples/simple_udf.rs | 151 -
rust/datafusion/Cargo.toml | 99 -
rust/datafusion/DEVELOPERS.md | 92 -
rust/datafusion/Dockerfile | 25 -
rust/datafusion/README.md | 356 -
rust/datafusion/benches/aggregate_query_sql.rs | 248 -
rust/datafusion/benches/filter_query_sql.rs | 91 -
rust/datafusion/benches/math_query_sql.rs | 111 -
rust/datafusion/benches/scalar.rs | 30 -
rust/datafusion/benches/sort_limit_query_sql.rs | 148 -
rust/datafusion/docs/cli.md | 95 -
.../docs/images/DataFusion-Logo-Dark.png | Bin 20134 -> 0 bytes
.../docs/images/DataFusion-Logo-Dark.svg | 1 -
.../docs/images/DataFusion-Logo-Light.png | Bin 19102 -> 0 bytes
.../docs/images/DataFusion-Logo-Light.svg | 1 -
rust/datafusion/src/bin/main.rs | 25 -
rust/datafusion/src/bin/repl.rs | 140 -
rust/datafusion/src/catalog/catalog.rs | 139 -
rust/datafusion/src/catalog/information_schema.rs | 492 -
rust/datafusion/src/catalog/mod.rs | 146 -
rust/datafusion/src/catalog/schema.rs | 104 -
rust/datafusion/src/dataframe.rs | 286 -
rust/datafusion/src/datasource/csv.rs | 144 -
rust/datafusion/src/datasource/datasource.rs | 103 -
rust/datafusion/src/datasource/empty.rs | 80 -
rust/datafusion/src/datasource/memory.rs | 472 -
rust/datafusion/src/datasource/mod.rs | 28 -
rust/datafusion/src/datasource/parquet.rs | 373 -
rust/datafusion/src/error.rs | 120 -
rust/datafusion/src/execution/context.rs | 3123 -----
rust/datafusion/src/execution/dataframe_impl.rs | 374 -
rust/datafusion/src/execution/mod.rs | 21 -
rust/datafusion/src/lib.rs | 211 -
rust/datafusion/src/logical_plan/builder.rs | 595 -
rust/datafusion/src/logical_plan/dfschema.rs | 521 -
rust/datafusion/src/logical_plan/display.rs | 270 -
rust/datafusion/src/logical_plan/expr.rs | 1505 ---
rust/datafusion/src/logical_plan/extension.rs | 79 -
rust/datafusion/src/logical_plan/mod.rs | 50 -
rust/datafusion/src/logical_plan/operators.rs | 135 -
rust/datafusion/src/logical_plan/plan.rs | 1095 --
rust/datafusion/src/logical_plan/registry.rs | 34 -
rust/datafusion/src/optimizer/constant_folding.rs | 591 -
rust/datafusion/src/optimizer/filter_push_down.rs | 1021 --
.../src/optimizer/hash_build_probe_order.rs | 257 -
rust/datafusion/src/optimizer/limit_push_down.rs | 252 -
rust/datafusion/src/optimizer/mod.rs | 27 -
rust/datafusion/src/optimizer/optimizer.rs | 32 -
.../src/optimizer/projection_push_down.rs | 542 -
rust/datafusion/src/optimizer/utils.rs | 489 -
.../src/physical_optimizer/coalesce_batches.rs | 88 -
.../src/physical_optimizer/merge_exec.rs | 74 -
rust/datafusion/src/physical_optimizer/mod.rs | 24 -
.../datafusion/src/physical_optimizer/optimizer.rs | 39 -
.../src/physical_optimizer/repartition.rs | 186 -
rust/datafusion/src/physical_plan/aggregates.rs | 258 -
.../src/physical_plan/array_expressions.rs | 127 -
.../src/physical_plan/coalesce_batches.rs | 316 -
rust/datafusion/src/physical_plan/common.rs | 104 -
.../src/physical_plan/crypto_expressions.rs | 198 -
rust/datafusion/src/physical_plan/csv.rs | 401 -
.../src/physical_plan/datetime_expressions.rs | 559 -
.../src/physical_plan/distinct_expressions.rs | 557 -
rust/datafusion/src/physical_plan/empty.rs | 186 -
rust/datafusion/src/physical_plan/explain.rs | 125 -
.../src/physical_plan/expressions/average.rs | 293 -
.../src/physical_plan/expressions/binary.rs | 1101 --
.../src/physical_plan/expressions/case.rs | 597 -
.../src/physical_plan/expressions/cast.rs | 301 -
.../src/physical_plan/expressions/coercion.rs | 208 -
.../src/physical_plan/expressions/column.rs | 86 -
.../src/physical_plan/expressions/count.rs | 235 -
.../src/physical_plan/expressions/in_list.rs | 458 -
.../src/physical_plan/expressions/is_not_null.rs | 119 -
.../src/physical_plan/expressions/is_null.rs | 119 -
.../src/physical_plan/expressions/literal.rs | 108 -
.../src/physical_plan/expressions/min_max.rs | 655 -
.../src/physical_plan/expressions/mod.rs | 135 -
.../src/physical_plan/expressions/negative.rs | 133 -
.../src/physical_plan/expressions/not.rs | 158 -
.../src/physical_plan/expressions/nullif.rs | 188 -
.../src/physical_plan/expressions/sum.rs | 373 -
.../src/physical_plan/expressions/try_cast.rs | 247 -
rust/datafusion/src/physical_plan/filter.rs | 240 -
rust/datafusion/src/physical_plan/functions.rs | 3767 ------
rust/datafusion/src/physical_plan/group_scalar.rs | 212 -
.../datafusion/src/physical_plan/hash_aggregate.rs | 1395 ---
rust/datafusion/src/physical_plan/hash_join.rs | 1265 --
rust/datafusion/src/physical_plan/hash_utils.rs | 201 -
rust/datafusion/src/physical_plan/limit.rs | 338 -
.../src/physical_plan/math_expressions.rs | 118 -
rust/datafusion/src/physical_plan/memory.rs | 161 -
rust/datafusion/src/physical_plan/merge.rs | 225 -
rust/datafusion/src/physical_plan/mod.rs | 369 -
rust/datafusion/src/physical_plan/parquet.rs | 1535 ---
rust/datafusion/src/physical_plan/planner.rs | 1106 --
rust/datafusion/src/physical_plan/projection.rs | 232 -
.../src/physical_plan/regex_expressions.rs | 172 -
rust/datafusion/src/physical_plan/repartition.rs | 461 -
rust/datafusion/src/physical_plan/sort.rs | 478 -
.../src/physical_plan/string_expressions.rs | 595 -
rust/datafusion/src/physical_plan/type_coercion.rs | 361 -
rust/datafusion/src/physical_plan/udaf.rs | 168 -
rust/datafusion/src/physical_plan/udf.rs | 112 -
.../src/physical_plan/unicode_expressions.rs | 532 -
rust/datafusion/src/physical_plan/union.rs | 143 -
rust/datafusion/src/prelude.rs | 37 -
rust/datafusion/src/scalar.rs | 821 --
rust/datafusion/src/sql/mod.rs | 23 -
rust/datafusion/src/sql/parser.rs | 380 -
rust/datafusion/src/sql/planner.rs | 2723 ----
rust/datafusion/src/sql/utils.rs | 376 -
rust/datafusion/src/test/exec.rs | 102 -
rust/datafusion/src/test/mod.rs | 346 -
rust/datafusion/src/test/user_defined.rs | 76 -
rust/datafusion/src/test/variable.rs | 58 -
rust/datafusion/src/variable/mod.rs | 36 -
rust/datafusion/tests/aggregate_simple.csv | 16 -
rust/datafusion/tests/custom_sources.rs | 200 -
rust/datafusion/tests/customer.csv | 4 -
rust/datafusion/tests/dataframe.rs | 79 -
rust/datafusion/tests/example.csv | 2 -
rust/datafusion/tests/provider_filter_pushdown.rs | 177 -
rust/datafusion/tests/sql.rs | 2707 ----
rust/datafusion/tests/user_defined_plan.rs | 512 -
294 files changed, 78087 deletions(-)
diff --git a/.dockerignore b/.dockerignore
index eb71138..36732a2 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -49,7 +49,6 @@
!ruby/red-plasma/lib/plasma/version.rb
!ruby/red-plasma/red-plasma.gemspec
!rust/Cargo.toml
-!rust/benchmarks/Cargo.toml
!rust/arrow/Cargo.toml
!rust/arrow/benches
!rust/arrow-flight/Cargo.toml
@@ -57,6 +56,4 @@
!rust/parquet/build.rs
!rust/parquet_derive/Cargo.toml
!rust/parquet_derive_test/Cargo.toml
-!rust/datafusion/Cargo.toml
-!rust/datafusion/benches
!rust/integration-testing/Cargo.toml
diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml
index 098e1ba..65ca3f2 100644
--- a/.github/workflows/dev_pr/labeler.yml
+++ b/.github/workflows/dev_pr/labeler.yml
@@ -48,12 +48,6 @@ lang-ruby:
lang-rust:
- rust/**/*
-datafusion:
- - rust/datafusion/**/*
-
-ballista:
- - rust/ballista/**/*
-
flight:
- cpp/src/arrow/flight/**/*
- r/R/flight.*
diff --git a/rust/ballista/.dockerignore b/rust/ballista/.dockerignore
deleted file mode 100644
index 3cde49e..0000000
--- a/rust/ballista/.dockerignore
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-rust/**/target
diff --git a/rust/ballista/README.md b/rust/ballista/README.md
deleted file mode 100644
index 288386f..0000000
--- a/rust/ballista/README.md
+++ /dev/null
@@ -1,64 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-# Ballista: Distributed Compute with Apache Arrow
-
-Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow. It is built
-on an architecture that allows other programming languages (such as Python, C++, and Java) to be supported as
-first-class citizens without paying a penalty for serialization costs.
-
-The foundational technologies in Ballista are:
-
-- [Apache Arrow](https://arrow.apache.org/) memory model and compute kernels for efficient processing of data.
-- [Apache Arrow Flight Protocol](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for efficient
- data transfer between processes.
-- [Google Protocol Buffers](https://developers.google.com/protocol-buffers) for serializing query plans.
-- [Docker](https://www.docker.com/) for packaging up executors along with user-defined code.
-
-Ballista can be deployed as a standalone cluster and also supports [Kubernetes](https://kubernetes.io/). In either
-case, the scheduler can be configured to use [etcd](https://etcd.io/) as a backing store to (eventually) provide
-redundancy in the case of a scheduler failure.
-
-# How does this compare to Apache Spark?
-
-Although Ballista is largely inspired by Apache Spark, there are some key differences.
-
-- The choice of Rust as the main execution language means that memory usage is deterministic and avoids the overhead of
- GC pauses.
-- Ballista is designed from the ground up to use columnar data, enabling a number of efficiencies such as vectorized
- processing (SIMD and GPU) and efficient compression. Although Spark does have some columnar support, it is still
- largely row-based today.
-- The combination of Rust and Arrow provides excellent memory efficiency and memory usage can be 5x - 10x lower than
- Apache Spark in some cases, which means that more processing can fit on a single node, reducing the overhead of
- distributed compute.
-- The use of Apache Arrow as the memory model and network protocol means that data can be exchanged between executors
- in any programming language with minimal serialization overhead.
-
-# Status
-
-The Ballista project was donated to Apache Arrow in April 2021 and work is underway to integrate more tightly with
-DataFusion.
-
-One of the goals is to implement a common scheduler that can seamlessly scale queries across cores in DataFusion and
-across nodes in Ballista.
-
-Ballista issues are tracked in ASF JIRA [here](https://issues.apache.org/jira/issues/?jql=project%20%3D%20ARROW%20AND%20component%20%3D%20%22Rust%20-%20Ballista%22)
-
-
-
diff --git a/rust/ballista/dev/build-rust-base.sh b/rust/ballista/dev/build-rust-base.sh
deleted file mode 100755
index ee4b32c..0000000
--- a/rust/ballista/dev/build-rust-base.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-BALLISTA_VERSION=0.4.2-SNAPSHOT
-set -e
-docker build -t ballistacompute/rust-base:$BALLISTA_VERSION -f docker/rust-base.dockerfile .
diff --git a/rust/ballista/dev/build-rust.sh b/rust/ballista/dev/build-rust.sh
deleted file mode 100755
index 1916f8e..0000000
--- a/rust/ballista/dev/build-rust.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-BALLISTA_VERSION=0.4.2-SNAPSHOT
-
-set -e
-
-docker build -t ballistacompute/ballista-rust:$BALLISTA_VERSION -f docker/rust.dockerfile .
diff --git a/rust/ballista/dev/integration-tests.sh b/rust/ballista/dev/integration-tests.sh
deleted file mode 100755
index cc34a5c..0000000
--- a/rust/ballista/dev/integration-tests.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-set -e
-./dev/build-rust.sh
-pushd rust/benchmarks/tpch
-./tpch-gen.sh
-
-docker-compose up -d
-docker-compose run ballista-client ./run.sh
-docker-compose down
-
-popd
diff --git a/rust/ballista/docker/README.md b/rust/ballista/docker/README.md
deleted file mode 100644
index 8417d04..0000000
--- a/rust/ballista/docker/README.md
+++ /dev/null
@@ -1,29 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-# Ballista Docker Images
-
-Pre-built docker images are available from [Docker Hub](https://hub.docker.com/orgs/ballistacompute/repositories) but here are the commands to build the images from source.
-
-Run these commands from the root directory of the project.
-
-```bash
-./dev/build-all.sh
-```
-
diff --git a/rust/ballista/docker/rust-base.dockerfile b/rust/ballista/docker/rust-base.dockerfile
deleted file mode 100644
index 4519225..0000000
--- a/rust/ballista/docker/rust-base.dockerfile
+++ /dev/null
@@ -1,99 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Turn .dockerignore to .dockerallow by excluding everything and explicitly
-# allowing specific files and directories. This enables us to quickly add
-# dependency files to the docker content without scanning the whole directory.
-# This setup requires all of our docker containers to have arrow's source
-# as a mounted directory.
-
-
-# Base image extends debian:buster-slim
-FROM rust:1.49.0-buster AS builder
-
-RUN apt update && apt -y install musl musl-dev musl-tools libssl-dev openssl
-
-#NOTE: the following was copied from https://github.com/emk/rust-musl-builder/blob/master/Dockerfile under Apache 2.0 license
-
-# The OpenSSL version to use. We parameterize this because many Rust
-# projects will fail to build with 1.1.
-#ARG OPENSSL_VERSION=1.0.2r
-ARG OPENSSL_VERSION=1.1.1b
-
-# Build a static library version of OpenSSL using musl-libc. This is needed by
-# the popular Rust `hyper` crate.
-#
-# We point /usr/local/musl/include/linux at some Linux kernel headers (not
-# necessarily the right ones) in an effort to compile OpenSSL 1.1's "engine"
-# component. It's possible that this will cause bizarre and terrible things to
-# happen. There may be "sanitized" headers that could be used instead.
-RUN echo "Building OpenSSL" && \
- ls /usr/include/linux && \
- mkdir -p /usr/local/musl/include && \
- ln -s /usr/include/linux /usr/local/musl/include/linux && \
- ln -s /usr/include/x86_64-linux-gnu/asm /usr/local/musl/include/asm && \
- ln -s /usr/include/asm-generic /usr/local/musl/include/asm-generic && \
- cd /tmp && \
- curl -LO "https://www.openssl.org/source/openssl-$OPENSSL_VERSION.tar.gz" && \
- tar xvzf "openssl-$OPENSSL_VERSION.tar.gz" && cd "openssl-$OPENSSL_VERSION" && \
- env CC=musl-gcc ./Configure no-shared no-zlib -fPIC --prefix=/usr/local/musl -DOPENSSL_NO_SECURE_MEMORY linux-x86_64 && \
- env C_INCLUDE_PATH=/usr/local/musl/include/ make depend && \
- env C_INCLUDE_PATH=/usr/local/musl/include/ make && \
- make install && \
- rm /usr/local/musl/include/linux /usr/local/musl/include/asm /usr/local/musl/include/asm-generic && \
- rm -r /tmp/*
-
-RUN echo "Building zlib" && \
- cd /tmp && \
- ZLIB_VERSION=1.2.11 && \
- curl -LO "http://zlib.net/zlib-$ZLIB_VERSION.tar.gz" && \
- tar xzf "zlib-$ZLIB_VERSION.tar.gz" && cd "zlib-$ZLIB_VERSION" && \
- CC=musl-gcc ./configure --static --prefix=/usr/local/musl && \
- make && make install && \
- rm -r /tmp/*
-
-RUN echo "Building libpq" && \
- cd /tmp && \
- POSTGRESQL_VERSION=11.2 && \
- curl -LO "https://ftp.postgresql.org/pub/source/v$POSTGRESQL_VERSION/postgresql-$POSTGRESQL_VERSION.tar.gz" && \
- tar xzf "postgresql-$POSTGRESQL_VERSION.tar.gz" && cd "postgresql-$POSTGRESQL_VERSION" && \
- CC=musl-gcc CPPFLAGS=-I/usr/local/musl/include LDFLAGS=-L/usr/local/musl/lib ./configure --with-openssl --without-readline --prefix=/usr/local/musl && \
- cd src/interfaces/libpq && make all-static-lib && make install-lib-static && \
- cd ../../bin/pg_config && make && make install && \
- rm -r /tmp/*
-
-ENV OPENSSL_DIR=/usr/local/musl/ \
- OPENSSL_INCLUDE_DIR=/usr/local/musl/include/ \
- DEP_OPENSSL_INCLUDE=/usr/local/musl/include/ \
- OPENSSL_LIB_DIR=/usr/local/musl/lib/ \
- OPENSSL_STATIC=1 \
- PQ_LIB_STATIC_X86_64_UNKNOWN_LINUX_MUSL=1 \
- PG_CONFIG_X86_64_UNKNOWN_LINUX_GNU=/usr/bin/pg_config \
- PKG_CONFIG_ALLOW_CROSS=true \
- PKG_CONFIG_ALL_STATIC=true \
- LIBZ_SYS_STATIC=1 \
- TARGET=musl
-
-# The copied content mentioned in the NOTE above ends here.
-
-## Download the target for static linking.
-RUN rustup target add x86_64-unknown-linux-musl
-RUN cargo install cargo-build-deps
-
-# prepare toolchain
-RUN rustup update && \
- rustup component add rustfmt
\ No newline at end of file
diff --git a/rust/ballista/docker/rust.dockerfile b/rust/ballista/docker/rust.dockerfile
deleted file mode 100644
index 8b06af3..0000000
--- a/rust/ballista/docker/rust.dockerfile
+++ /dev/null
@@ -1,71 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Turn .dockerignore to .dockerallow by excluding everything and explicitly
-# allowing specific files and directories. This enables us to quickly add
-# dependency files to the docker content without scanning the whole directory.
-# This setup requires all of our docker containers to have arrow's source
-# as a mounted directory.
-
-ARG RELEASE_FLAG=--release
-FROM ballistacompute/rust-base:0.4.0-20210213 AS base
-WORKDIR /tmp/ballista
-RUN apt-get -y install cmake
-RUN cargo install cargo-chef
-
-FROM base as planner
-COPY rust .
-RUN cargo chef prepare --recipe-path recipe.json
-
-FROM base as cacher
-COPY --from=planner /tmp/ballista/recipe.json recipe.json
-RUN cargo chef cook $RELEASE_FLAG --recipe-path recipe.json
-
-FROM base as builder
-COPY rust .
-COPY --from=cacher /tmp/ballista/target target
-ARG RELEASE_FLAG=--release
-
-# force build.rs to run to generate configure_me code.
-ENV FORCE_REBUILD='true'
-RUN cargo build $RELEASE_FLAG
-
-# put the executor on /executor (need to be copied from different places depending on FLAG)
-ENV RELEASE_FLAG=${RELEASE_FLAG}
-RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/target/debug/ballista-executor /executor; else mv /tmp/ballista/target/release/ballista-executor /executor; fi
-
-# put the scheduler on /scheduler (need to be copied from different places depending on FLAG)
-ENV RELEASE_FLAG=${RELEASE_FLAG}
-RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/target/debug/ballista-scheduler /scheduler; else mv /tmp/ballista/target/release/ballista-scheduler /scheduler; fi
-
-# put the tpch on /tpch (need to be copied from different places depending on FLAG)
-ENV RELEASE_FLAG=${RELEASE_FLAG}
-RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/target/debug/tpch /tpch; else mv /tmp/ballista/target/release/tpch /tpch; fi
-
-# Copy the binary into a new container for a smaller docker image
-FROM ballistacompute/rust-base:0.4.0-20210213
-
-COPY --from=builder /executor /
-
-COPY --from=builder /scheduler /
-
-COPY --from=builder /tpch /
-
-ENV RUST_LOG=info
-ENV RUST_BACKTRACE=full
-
-CMD ["/executor", "--local"]
diff --git a/rust/ballista/docs/README.md b/rust/ballista/docs/README.md
deleted file mode 100644
index 44c831d..0000000
--- a/rust/ballista/docs/README.md
+++ /dev/null
@@ -1,37 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-# Ballista Developer Documentation
-
-This directory contains documentation for developers that are contributing to Ballista. If you are looking for
-end-user documentation for a published release, please start with the
-[Ballista User Guide](https://ballistacompute.org/docs/) instead.
-
-## Architecture & Design
-
-- Read the [Architecture Overview](architecture.md) to get an understanding of the scheduler and executor
- processes and how distributed query execution works.
-
-## Build, Test, Release
-
-- Setting up a [Rust development environment](dev-env-rust.md).
-- Setting up a [Java development environment](dev-env-jvm.md).
-- Notes on building [Rust docker images](rust-docker.md)
-- [Integration Testing](integration-testing.md)
-- [Release process](release-process.md)
-
diff --git a/rust/ballista/docs/architecture.md b/rust/ballista/docs/architecture.md
deleted file mode 100644
index a73b53a..0000000
--- a/rust/ballista/docs/architecture.md
+++ /dev/null
@@ -1,75 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-# Ballista Architecture
-
-## Overview
-
-Ballista allows queries to be executed in a distributed cluster. A cluster consists of one or
-more scheduler processes and one or more executor processes. See the following sections in this document for more
-details about these components.
-
-The scheduler accepts logical query plans and translates them into physical query plans using DataFusion and then
-runs a secondary planning/optimization process to translate the physical query plan into a distributed physical
-query plan.
-
-This process breaks a query down into a number of query stages that can be executed independently. There are
-dependencies between query stages, and these dependencies form a directed acyclic graph (DAG) because a query
-stage cannot start until its child query stages have completed.
-
-Each query stage has one or more partitions that can be processed in parallel by the available
-executors in the cluster. This is the basic unit of scalability in Ballista.
-
-The following diagram shows the flow of requests and responses between the client, scheduler, and executor
-processes.
-
-![Query Execution Flow](images/query-execution.png)
-
-## Scheduler Process
-
-The scheduler process implements a gRPC interface (defined in
-[ballista.proto](../rust/ballista/proto/ballista.proto)). The interface provides the following methods:
-
-| Method | Description |
-|----------------------|----------------------------------------------------------------------|
-| ExecuteQuery | Submit a logical query plan or SQL query for execution |
-| GetExecutorsMetadata | Retrieves a list of executors that have registered with a scheduler |
-| GetFileMetadata | Retrieve metadata about files available in the cluster file system |
-| GetJobStatus | Get the status of a submitted query |
-| RegisterExecutor | Executors call this method to register themselves with the scheduler |
-
-The scheduler can run in standalone mode, or in clustered mode using etcd as a backing store for state.
-
-## Executor Process
-
-The executor process implements the Apache Arrow Flight gRPC interface and is responsible for:
-
-- Executing query stages and persisting the results to disk in Apache Arrow IPC Format
-- Making query stage results available as Flights so that they can be retrieved by other executors as well as by
- clients
-
-## Rust Client
-
-The Rust client provides a DataFrame API that is a thin wrapper around the DataFusion DataFrame and provides
-the means for a client to build a query plan for execution.
-
-The client executes the query plan by submitting an `ExecuteQuery` request to the scheduler and then calls
-`GetJobStatus` to check for completion. On completion, the client receives a list of locations for the Flights
-containing the results for the query and will then connect to the appropriate executor processes to retrieve
-those results.
-
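
The architecture notes above describe a submit-and-poll protocol between the client and the scheduler. Below is a minimal Rust sketch of that flow; `Scheduler`, `JobStatus`, and `PartitionLocation` are hypothetical stand-ins for the gRPC stubs generated from `ballista.proto`, not the real generated API.

```rust
// Minimal sketch of the client flow described above: submit a plan via
// ExecuteQuery, poll GetJobStatus, then fetch result partitions over Flight.
// All types here are illustrative stand-ins, not the generated gRPC stubs.

use std::{thread, time::Duration};

#[derive(Clone, Debug)]
struct PartitionLocation {
    executor_host: String,
    executor_port: u16,
    partition_id: usize,
}

#[derive(Clone, Debug)]
enum JobStatus {
    Queued,
    Running,
    Completed { partition_locations: Vec<PartitionLocation> },
    Failed(String),
}

trait Scheduler {
    /// Stand-in for the ExecuteQuery RPC: submit a plan, receive a job id.
    fn execute_query(&self, sql: &str) -> Result<String, String>;
    /// Stand-in for the GetJobStatus RPC.
    fn get_job_status(&self, job_id: &str) -> Result<JobStatus, String>;
}

/// Submit a query, then poll until the scheduler reports completion and
/// returns the Flight locations holding the result partitions.
fn run_query(scheduler: &dyn Scheduler, sql: &str) -> Result<Vec<PartitionLocation>, String> {
    let job_id = scheduler.execute_query(sql)?;
    loop {
        match scheduler.get_job_status(&job_id)? {
            // The client would now open a Flight connection to each
            // executor_host:executor_port to retrieve the partitions.
            JobStatus::Completed { partition_locations } => return Ok(partition_locations),
            JobStatus::Failed(msg) => return Err(msg),
            JobStatus::Queued | JobStatus::Running => thread::sleep(Duration::from_millis(100)),
        }
    }
}

struct MockScheduler;

impl Scheduler for MockScheduler {
    fn execute_query(&self, _sql: &str) -> Result<String, String> {
        Ok("job-1".to_string())
    }
    fn get_job_status(&self, _job_id: &str) -> Result<JobStatus, String> {
        Ok(JobStatus::Completed {
            partition_locations: vec![PartitionLocation {
                executor_host: "executor-1".to_string(),
                executor_port: 50051,
                partition_id: 0,
            }],
        })
    }
}

fn main() {
    let locations = run_query(&MockScheduler, "SELECT 1").expect("query failed");
    for loc in locations {
        println!("partition {} at {}:{}", loc.partition_id, loc.executor_host, loc.executor_port);
    }
}
```
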
diff --git a/rust/ballista/docs/dev-env-rust.md b/rust/ballista/docs/dev-env-rust.md
deleted file mode 100644
index bf50c9d..0000000
--- a/rust/ballista/docs/dev-env-rust.md
+++ /dev/null
@@ -1,38 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-# Setting up a Rust development environment
-
-You will need a standard Rust development environment. The easiest way to achieve this is by using rustup: https://rustup.rs/
-
-## Install OpenSSL
-
-Follow instructions for [setting up OpenSSL](https://docs.rs/openssl/0.10.28/openssl/). For Ubuntu users, the following
-command works.
-
-```bash
-sudo apt-get install pkg-config libssl-dev
-```
-
-## Install CMake
-
-You'll need cmake in order to compile some of ballista's dependencies. Ubuntu users can use the following command:
-
-```bash
-sudo apt-get install cmake
-```
\ No newline at end of file
diff --git a/rust/ballista/docs/images/query-execution.png b/rust/ballista/docs/images/query-execution.png
deleted file mode 100644
index b352402..0000000
Binary files a/rust/ballista/docs/images/query-execution.png and /dev/null differ
diff --git a/rust/ballista/docs/integration-testing.md b/rust/ballista/docs/integration-testing.md
deleted file mode 100644
index 2a979b6..0000000
--- a/rust/ballista/docs/integration-testing.md
+++ /dev/null
@@ -1,32 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-# Integration Testing
-
-Ballista has a [benchmark crate](https://github.com/ballista-compute/ballista/tree/main/rust/benchmarks/tpch) derived
-from TPC-H, which is currently the main form of integration testing.
-
-The following command can be used to run the integration tests.
-
-```bash
-./dev/integration-tests.sh
-```
-
-Please refer to the
-[benchmark documentation](https://github.com/ballista-compute/ballista/blob/main/rust/benchmarks/tpch/README.md)
-for more information.
diff --git a/rust/ballista/docs/release-process.md b/rust/ballista/docs/release-process.md
deleted file mode 100644
index c6c45c3..0000000
--- a/rust/ballista/docs/release-process.md
+++ /dev/null
@@ -1,68 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-# Release Process
-
-These instructions are for project maintainers wishing to create public releases of Ballista.
-
-- Create a `release-0.4` branch or merge latest from `main` into an existing `release-0.4` branch.
-- Update version numbers using `./dev/bump-version.sh`
-- Run integration tests with `./dev/integration-tests.sh`
-- Push changes
-- Create `v0.4.x` release tag from the `release-0.4` branch
-- Publish Docker images
-- Publish crate if possible (if we're using a published version of Arrow)
-
-## Publishing Java artifacts to Maven Central
-
-The JVM artifacts are published to Maven Central by uploading to Sonatype. You will need to set the environment
-variables `SONATYPE_USERNAME` and `SONATYPE_PASSWORD` to the correct values for your account and you will also need
-verified GPG keys available for signing the artifacts (instructions tbd).
-
-Run the following command to publish the artifacts to a Sonatype staging repository.
-
-```bash
-./dev/publish-jvm.sh
-```
-
-## Publishing Rust Artifacts
-
-Run the following script to publish the Rust crate to crates.io.
-
-```
-./dev/publish-rust.sh
-```
-
-## Publishing Docker Images
-
-Run the following script to publish the executor Docker images to Docker Hub.
-
-```
-./dev/publish-docker-images.sh
-```
-
-## GPG Notes
-
-Refer to [this article](https://help.github.com/en/github/authenticating-to-github/generating-a-new-gpg-key) for
-instructions on setting up GPG keys. Some useful commands are:
-
-```bash
-gpg --full-generate-key
-gpg --export-secret-keys > ~/.gnupg/secring.gpg
-gpg --key-server keys.openpgp.org --send-keys KEYID
-```
\ No newline at end of file
diff --git a/rust/ballista/docs/rust-docker.md b/rust/ballista/docs/rust-docker.md
deleted file mode 100644
index 0b94a14..0000000
--- a/rust/ballista/docs/rust-docker.md
+++ /dev/null
@@ -1,66 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-### How to build the Rust Docker image
-
-To build the docker image in development, use
-
-```
-docker build -f docker/rust.dockerfile -t ballistacompute/ballista-rust:latest .
-```
-
-This uses a multi-stage build, in which the build stage is called `builder`.
-Our GitHub registry has this target cached, which we use to speed up the build time:
-
-```
-export BUILDER_IMAGE=docker.pkg.github.com/ballista-compute/ballista/ballista-rust-builder:main
-
-docker login docker.pkg.github.com -u ... -p ... # use a personal access token with the read:packages scope
-docker pull $BUILDER_IMAGE
-
-docker build --cache-from $BUILDER_IMAGE -f docker/rust.dockerfile -t ballista:latest .
-```
-
-will build the image by reusing the cached image.
-
-### Docker images for development
-
-This project often requires testing on Kubernetes. For this reason, we have a GitHub workflow to push images to
-GitHub's registry, both from this repo and its forks.
-
-The basic principle is that every push to a git reference builds and publishes a docker image.
-Specifically, given a branch or tag `${REF}`,
-
-* `docker.pkg.github.com/ballista-compute/ballista/ballista-rust:${REF}` is the latest image from $REF
-* `docker.pkg.github.com/${USER}/ballista/ballista-rust:${REF}` is the latest image from $REF on your fork
-
-To pull them from a Kubernetes cluster or your computer, you need a personal access token with the `read:packages` scope,
-and you must log in to the registry `docker.pkg.github.com`.
-
-The builder image - the large image with all the cargo caches - is available on the same registry as described above, and is also
-available in all forks and for all references.
-
-Please refer to the [rust workflow](.github/workflows/rust.yaml) and [rust dockerfile](docker/rust.dockerfile) for details on how we build and publish these images.
-
-### Get the binary
-
-If you want to run this on a Linux machine rather than in Docker, you can extract the latest binary from a Docker image on the registry: the binary is statically linked and thus runs on any Linux-based machine. You can get it using
-
-```
-id=$(docker create $BUILDER_IMAGE) && docker cp $id:/executor executor && docker rm -v $id
-```
diff --git a/rust/ballista/docs/user-guide/.gitignore b/rust/ballista/docs/user-guide/.gitignore
deleted file mode 100644
index e662f99..0000000
--- a/rust/ballista/docs/user-guide/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-ballista-book.tgz
-book
\ No newline at end of file
diff --git a/rust/ballista/docs/user-guide/README.md b/rust/ballista/docs/user-guide/README.md
deleted file mode 100644
index 9ee3e90..0000000
--- a/rust/ballista/docs/user-guide/README.md
+++ /dev/null
@@ -1,36 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-# Ballista User Guide Source
-
-This directory contains the sources for the user guide that is published at https://ballistacompute.org/docs/.
-
-## Generate HTML
-
-```bash
-cargo install mdbook
-mdbook build
-```
-
-## Deploy User Guide to Web Site
-
-Requires an SSH certificate to be available.
-
-```bash
-./deploy.sh
-```
\ No newline at end of file
diff --git a/rust/ballista/docs/user-guide/book.toml b/rust/ballista/docs/user-guide/book.toml
deleted file mode 100644
index cf1653d..0000000
--- a/rust/ballista/docs/user-guide/book.toml
+++ /dev/null
@@ -1,23 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[book]
-authors = ["Andy Grove"]
-language = "en"
-multilingual = false
-src = "src"
-title = "Ballista User Guide"
diff --git a/rust/ballista/docs/user-guide/src/SUMMARY.md b/rust/ballista/docs/user-guide/src/SUMMARY.md
deleted file mode 100644
index c8fc2c8..0000000
--- a/rust/ballista/docs/user-guide/src/SUMMARY.md
+++ /dev/null
@@ -1,30 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-# Summary
-
-- [Introduction](introduction.md)
-- [Create a Ballista Cluster](deployment.md)
- - [Docker](standalone.md)
- - [Docker Compose](docker-compose.md)
- - [Kubernetes](kubernetes.md)
- - [Ballista Configuration](configuration.md)
-- [Clients](clients.md)
- - [Rust](client-rust.md)
- - [Python](client-python.md)
-- [Frequently Asked Questions](faq.md)
\ No newline at end of file
diff --git a/rust/ballista/docs/user-guide/src/client-rust.md b/rust/ballista/docs/user-guide/src/client-rust.md
deleted file mode 100644
index 048c10f..0000000
--- a/rust/ballista/docs/user-guide/src/client-rust.md
+++ /dev/null
@@ -1,22 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-## Ballista Rust Client
-
-The Rust client supports a `DataFrame` API as well as SQL. See the
-[TPC-H Benchmark Client](https://github.com/ballista-compute/ballista/tree/main/rust/benchmarks/tpch) for an example.
\ No newline at end of file
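
For a rough feel of the SQL entry point mentioned above: the real client context lived in rust/ballista/rust/client/src/context.rs (removed by this commit), so the stub below is an assumption about its shape, not the actual API.

```rust
// Illustrative stub only: stands in for the real Ballista client context
// removed by this commit. Names and signatures here are assumptions.

struct BallistaContext;

impl BallistaContext {
    /// Connect to a remote scheduler (host and port are placeholders).
    fn remote(_host: &str, _port: u16) -> Self {
        BallistaContext
    }

    /// SQL entry point: hand a query string to the cluster for planning.
    fn sql(&self, query: &str) -> String {
        format!("would plan and submit: {}", query)
    }
}

fn main() {
    let ctx = BallistaContext::remote("localhost", 50050);
    println!("{}", ctx.sql("SELECT count(*) FROM lineitem"));
}
```
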
diff --git a/rust/ballista/docs/user-guide/src/clients.md b/rust/ballista/docs/user-guide/src/clients.md
deleted file mode 100644
index 1e223dd..0000000
--- a/rust/ballista/docs/user-guide/src/clients.md
+++ /dev/null
@@ -1,22 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-## Clients
-
-- [Rust](client-rust.md)
-- [Python](client-python.md)
diff --git a/rust/ballista/docs/user-guide/src/configuration.md b/rust/ballista/docs/user-guide/src/configuration.md
deleted file mode 100644
index 52b05b0..0000000
--- a/rust/ballista/docs/user-guide/src/configuration.md
+++ /dev/null
@@ -1,32 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-# Configuration
-The Rust executor and scheduler can be configured using TOML files, environment variables, and command-line arguments. The specification for config options can be found in `rust/ballista/src/bin/[executor|scheduler]_config_spec.toml`.
-
-Those files fully define Ballista's configuration. If there is a discrepancy between this documentation and the files, assume those files are correct.
-
-To get a list of command line arguments, run the binary with `--help`
-
-There is an example config file at `ballista/rust/ballista/examples/example_executor_config.toml`
-
-The order of precedence for arguments is: default config file < environment variables < specified config file < command line arguments.
-
-The executor and scheduler will look for the default config file at `/etc/ballista/[executor|scheduler].toml` To specify a config file use the `--config-file` argument.
-
-Environment variables are prefixed by `BALLISTA_EXECUTOR` or `BALLISTA_SCHEDULER` for the executor and scheduler respectively. Hyphens in command-line arguments become underscores. For example, the `--scheduler-host` argument for the executor becomes `BALLISTA_EXECUTOR_SCHEDULER_HOST`.
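-
-For example, the following two ways of setting the scheduler host for the executor are equivalent (the host value is a placeholder):
-
-```bash
-# set the scheduler host with a command-line argument
-/executor --scheduler-host localhost
-
-# or with the corresponding environment variable
-BALLISTA_EXECUTOR_SCHEDULER_HOST=localhost /executor
-```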
\ No newline at end of file
diff --git a/rust/ballista/docs/user-guide/src/deployment.md b/rust/ballista/docs/user-guide/src/deployment.md
deleted file mode 100644
index 2432f2b..0000000
--- a/rust/ballista/docs/user-guide/src/deployment.md
+++ /dev/null
@@ -1,26 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-# Deployment
-
-Ballista is packaged as Docker images. Refer to the following guides to create a Ballista cluster:
-
-- [Create a cluster using Docker](standalone.md)
-- [Create a cluster using Docker Compose](docker-compose.md)
-- [Create a cluster using Kubernetes](kubernetes.md)
-
diff --git a/rust/ballista/docs/user-guide/src/docker-compose.md b/rust/ballista/docs/user-guide/src/docker-compose.md
deleted file mode 100644
index 2548e57..0000000
--- a/rust/ballista/docs/user-guide/src/docker-compose.md
+++ /dev/null
@@ -1,55 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-# Installing Ballista with Docker Compose
-
-Docker Compose is a convenient way to launch a cluster when testing locally. The following Docker Compose example
-demonstrates how to start a cluster using a single process that acts as both a scheduler and an executor, with a data
-volume mounted into the container so that Ballista can access the host file system.
-
-```yaml
-version: '2.0'
-services:
- etcd:
- image: quay.io/coreos/etcd:v3.4.9
- command: "etcd -advertise-client-urls http://etcd:2379 -listen-client-urls http://0.0.0.0:2379"
- ports:
- - "2379:2379"
- ballista-executor:
- image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT
- command: "/executor --bind-host 0.0.0.0 --port 50051 --local"
- environment:
- - RUST_LOG=info
- ports:
- - "50050:50050"
- - "50051:50051"
- volumes:
- - ./data:/data
-```
-
-With the above content saved to a `docker-compose.yaml` file, the following command can be used to start the
-single-node cluster.
-
-```bash
-docker-compose up
-```
-
-The scheduler listens on port 50050 and this is the port that clients will need to connect to.
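-
-To verify that the executor has started, you can tail its logs (a sketch using the service name from the example above):
-
-```bash
-docker-compose logs -f ballista-executor
-```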
diff --git a/rust/ballista/docs/user-guide/src/faq.md b/rust/ballista/docs/user-guide/src/faq.md
deleted file mode 100644
index b73a376..0000000
--- a/rust/ballista/docs/user-guide/src/faq.md
+++ /dev/null
@@ -1,31 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-# Frequently Asked Questions
-
-## What is the relationship between Apache Arrow, DataFusion, and Ballista?
-
-Apache Arrow is a library which provides a standardized memory representation for columnar data. It also provides
-"kernels" for performing common operations on this data.
-
-DataFusion is a library for executing queries in-process using the Apache Arrow memory
-model and computational kernels. It is designed to run within a single process, using threads
-for parallel query execution.
-
-Ballista is a distributed compute platform designed to leverage DataFusion and other query
-execution libraries.
\ No newline at end of file
diff --git a/rust/ballista/docs/user-guide/src/img/ballista-architecture.png b/rust/ballista/docs/user-guide/src/img/ballista-architecture.png
deleted file mode 100644
index 2f78f29..0000000
Binary files a/rust/ballista/docs/user-guide/src/img/ballista-architecture.png and /dev/null differ
diff --git a/rust/ballista/docs/user-guide/src/introduction.md b/rust/ballista/docs/user-guide/src/introduction.md
deleted file mode 100644
index 59d7a1a..0000000
--- a/rust/ballista/docs/user-guide/src/introduction.md
+++ /dev/null
@@ -1,52 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-## Overview
-
-Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow. It is
-built on an architecture that allows other programming languages to be supported as first-class citizens without paying
-a penalty for serialization costs.
-
-The foundational technologies in Ballista are:
-
-- [Apache Arrow](https://arrow.apache.org/) memory model and compute kernels for efficient processing of data.
-- [Apache Arrow Flight Protocol](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for efficient data transfer between processes.
-- [Google Protocol Buffers](https://developers.google.com/protocol-buffers) for serializing query plans.
-- [Docker](https://www.docker.com/) for packaging up executors along with user-defined code.
-
-## Architecture
-
-The following diagram highlights some of the integrations that will be possible with this unique architecture. Note that not all components shown here are available yet.
-
-![Ballista Architecture Diagram](img/ballista-architecture.png)
-
-## How does this compare to Apache Spark?
-
-Although Ballista is largely inspired by Apache Spark, there are some key differences.
-
-- The choice of Rust as the main execution language means that memory usage is deterministic and avoids the overhead of GC pauses.
-- Ballista is designed from the ground up to use columnar data, enabling a number of efficiencies such as vectorized
-processing (SIMD and GPU) and efficient compression. Although Spark does have some columnar support, it is still
-largely row-based today.
-- The combination of Rust and Arrow provides excellent memory efficiency and memory usage can be 5x - 10x lower than Apache Spark in some cases, which means that more processing can fit on a single node, reducing the overhead of distributed compute.
-- The use of Apache Arrow as the memory model and network protocol means that data can be exchanged between executors in any programming language with minimal serialization overhead.
-
-## Status
-
-Ballista is currently at the proof-of-concept phase but is under active development by a growing community.
\ No newline at end of file
diff --git a/rust/ballista/docs/user-guide/src/kubernetes.md b/rust/ballista/docs/user-guide/src/kubernetes.md
deleted file mode 100644
index 8cd8bee..0000000
--- a/rust/ballista/docs/user-guide/src/kubernetes.md
+++ /dev/null
@@ -1,216 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-# Deploying Ballista with Kubernetes
-
-Ballista can be deployed to any Kubernetes cluster using the following instructions. These instructions assume that
-you are already comfortable with managing Kubernetes deployments.
-
-The k8s deployment consists of:
-
-- k8s stateful set for one or more scheduler processes
-- k8s stateful set for one or more executor processes
-- k8s service to route traffic to the schedulers
-- k8s persistent volume and persistent volume claims to make local data accessible to Ballista
-
-## Limitations
-
-Ballista is at an early stage of development and therefore has some significant limitations:
-
-- There is no support for shared object stores such as S3. All data must exist locally on each node in the
- cluster, including where any client process runs (until
- [#473](https://github.com/ballista-compute/ballista/issues/473) is resolved).
-- Only a single scheduler instance is currently supported unless the scheduler is configured to use `etcd` as a
- backing store.
-
-## Create Persistent Volume and Persistent Volume Claim
-
-Copy the following yaml to a `pv.yaml` file and apply it to the cluster to create a persistent volume and a persistent
-volume claim so that the specified host directory is available to the containers. This is where any data should be
-located so that Ballista can execute queries against it.
-
-```yaml
-apiVersion: v1
-kind: PersistentVolume
-metadata:
- name: data-pv
- labels:
- type: local
-spec:
- storageClassName: manual
- capacity:
- storage: 10Gi
- accessModes:
- - ReadWriteOnce
- hostPath:
- path: "/mnt"
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
- name: data-pv-claim
-spec:
- storageClassName: manual
- accessModes:
- - ReadWriteOnce
- resources:
- requests:
- storage: 3Gi
-```
-
-To apply this yaml:
-
-```bash
-kubectl apply -f pv.yaml
-```
-
-You should see the following output:
-
-```bash
-persistentvolume/data-pv created
-persistentvolumeclaim/data-pv-claim created
-```
-
-## Deploying Ballista Scheduler and Executors
-
-Copy the following yaml to a `cluster.yaml` file.
-
-```yaml
-apiVersion: v1
-kind: Service
-metadata:
- name: ballista-scheduler
- labels:
- app: ballista-scheduler
-spec:
- ports:
- - port: 50050
- name: scheduler
- clusterIP: None
- selector:
- app: ballista-scheduler
----
-apiVersion: apps/v1
-kind: StatefulSet
-metadata:
- name: ballista-scheduler
-spec:
- serviceName: "ballista-scheduler"
- replicas: 1
- selector:
- matchLabels:
- app: ballista-scheduler
- template:
- metadata:
- labels:
- app: ballista-scheduler
- ballista-cluster: ballista
- spec:
- containers:
- - name: ballista-scheduler
- image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT
- command: ["/scheduler"]
- args: ["--port=50050"]
- ports:
- - containerPort: 50050
- name: flight
- volumeMounts:
- - mountPath: /mnt
- name: data
- volumes:
- - name: data
- persistentVolumeClaim:
- claimName: data-pv-claim
----
-apiVersion: apps/v1
-kind: StatefulSet
-metadata:
- name: ballista-executor
-spec:
- serviceName: "ballista-scheduler"
- replicas: 2
- selector:
- matchLabels:
- app: ballista-executor
- template:
- metadata:
- labels:
- app: ballista-executor
- ballista-cluster: ballista
- spec:
- containers:
- - name: ballista-executor
- image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT
- command: ["/executor"]
- args: ["--port=50051", "--scheduler-host=ballista-scheduler", "--scheduler-port=50050", "--external-host=$(MY_POD_IP)"]
- env:
- - name: MY_POD_IP
- valueFrom:
- fieldRef:
- fieldPath: status.podIP
- ports:
- - containerPort: 50051
- name: flight
- volumeMounts:
- - mountPath: /mnt
- name: data
- volumes:
- - name: data
- persistentVolumeClaim:
- claimName: data-pv-claim
-```
-
-Apply this yaml to the cluster:
-
-```bash
-$ kubectl apply -f cluster.yaml
-```
-
-This should show the following output:
-
-```
-service/ballista-scheduler created
-statefulset.apps/ballista-scheduler created
-statefulset.apps/ballista-executor created
-```
-
-You can also check status by running `kubectl get pods`:
-
-```bash
-$ kubectl get pods
-NAME READY STATUS RESTARTS AGE
-busybox 1/1 Running 0 16m
-ballista-scheduler-0 1/1 Running 0 42s
-ballista-executor-0 1/1 Running 2 42s
-ballista-executor-1 1/1 Running 0 26s
-```
-
-You can view the scheduler logs with `kubectl logs ballista-scheduler-0`:
-
-```
-$ kubectl logs ballista-scheduler-0
-[2021-02-19T00:24:01Z INFO scheduler] Ballista v0.4.2-SNAPSHOT Scheduler listening on 0.0.0.0:50050
-[2021-02-19T00:24:16Z INFO ballista::scheduler] Received register_executor request for ExecutorMetadata { id: "b5e81711-1c5c-46ec-8522-d8b359793188", host: "10.1.23.149", port: 50051 }
-[2021-02-19T00:24:17Z INFO ballista::scheduler] Received register_executor request for ExecutorMetadata { id: "816e4502-a876-4ed8-b33f-86d243dcf63f", host: "10.1.23.150", port: 50051 }
-```
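-
-To connect a client from outside the cluster, one option is to forward the scheduler port to the local machine (a sketch; assumes `kubectl` access to the cluster and uses the pod name shown above):
-
-```bash
-kubectl port-forward ballista-scheduler-0 50050:50050
-```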
-
-## Deleting the Ballista cluster
-
-Run the following kubectl command to delete the cluster.
-
-```bash
-kubectl delete -f cluster.yaml
-```
\ No newline at end of file
diff --git a/rust/ballista/docs/user-guide/src/standalone.md b/rust/ballista/docs/user-guide/src/standalone.md
deleted file mode 100644
index e4c24fe..0000000
--- a/rust/ballista/docs/user-guide/src/standalone.md
+++ /dev/null
@@ -1,92 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-## Deploying a standalone Ballista cluster
-
-### Start a Scheduler
-
-Start a scheduler using the following syntax:
-
-```bash
-docker run --network=host \
- -d ballistacompute/ballista-rust:0.4.2-SNAPSHOT \
- /scheduler --port 50050
-```
-
-Run `docker ps` to check that the process is running:
-
-```
-$ docker ps
-CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
-59452ce72138 ballistacompute/ballista-rust:0.4.2-SNAPSHOT "/scheduler --port 5…" 6 seconds ago Up 5 seconds affectionate_hofstadter
-```
-
-Run `docker logs CONTAINER_ID` to check the output from the process:
-
-```
-$ docker logs 59452ce72138
-[2021-02-14T18:32:20Z INFO scheduler] Ballista v0.4.2-SNAPSHOT Scheduler listening on 0.0.0.0:50050
-```
-
-### Start executors
-
-Start one or more executor processes. Each executor process will need to listen on a different port; see the second example below.
-
-```bash
-docker run --network=host \
- -d ballistacompute/ballista-rust:0.4.2-SNAPSHOT \
- /executor --external-host localhost --port 50051
-```
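-
-For example, a second executor could be started on port 50052 using the same flags (a sketch; adjust the port as needed):
-
-```bash
-docker run --network=host \
-  -d ballistacompute/ballista-rust:0.4.2-SNAPSHOT \
-  /executor --external-host localhost --port 50052
-```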
-
-Use `docker ps` to check that both the scheduler and executor(s) are now running:
-
-```
-$ docker ps
-CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
-0746ce262a19 ballistacompute/ballista-rust:0.4.2-SNAPSHOT "/executor --externa…" 2 seconds ago Up 1 second naughty_mclean
-59452ce72138 ballistacompute/ballista-rust:0.4.2-SNAPSHOT "/scheduler --port 5…" 4 minutes ago Up 4 minutes affectionate_hofstadter
-```
-
-Use `docker logs CONTAINER_ID` to check the output from the executor(s):
-
-```
-$ docker logs 0746ce262a19
-[2021-02-14T18:36:25Z INFO executor] Running with config: ExecutorConfig { host: "localhost", port: 50051, work_dir: "/tmp/.tmpVRFSvn", concurrent_tasks: 4 }
-[2021-02-14T18:36:25Z INFO executor] Ballista v0.4.2-SNAPSHOT Rust Executor listening on 0.0.0.0:50051
-[2021-02-14T18:36:25Z INFO executor] Starting registration with scheduler
-```
-
-The external host and port will be registered with the scheduler. The executors will discover other executors by
-requesting a list of executors from the scheduler.
-
-### Using etcd as backing store
-
-_NOTE: This functionality is currently experimental_
-
-Ballista can optionally use [etcd](https://etcd.io/) as a backing store for the scheduler.
-
-```bash
-docker run --network=host \
- -d ballistacompute/ballista-rust:0.4.2-SNAPSHOT \
- /scheduler --port 50050 \
- --config-backend etcd \
- --etcd-urls etcd:2379
-```
-
-Please refer to the [etcd](https://etcd.io/) website for installation instructions. Etcd version 3.4.9 or later is
-recommended.
diff --git a/rust/ballista/rust/.dockerignore b/rust/ballista/rust/.dockerignore
deleted file mode 100644
index 96f99a5..0000000
--- a/rust/ballista/rust/.dockerignore
+++ /dev/null
@@ -1,23 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Turn .dockerignore to .dockerallow by excluding everything and explicitly
-# allowing specific files and directories. This enables us to quickly add
-# dependency files to the docker content without scanning the whole directory.
-# This setup requires that all of our docker containers have arrow's source
-# as a mounted directory.
-target
\ No newline at end of file
diff --git a/rust/ballista/rust/.gitignore b/rust/ballista/rust/.gitignore
deleted file mode 100644
index 97eec16..0000000
--- a/rust/ballista/rust/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-target
-temp
\ No newline at end of file
diff --git a/rust/ballista/rust/Cargo.toml b/rust/ballista/rust/Cargo.toml
deleted file mode 100644
index 5e344e0..0000000
--- a/rust/ballista/rust/Cargo.toml
+++ /dev/null
@@ -1,30 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[workspace]
-
-members = [
- "benchmarks/tpch",
- "client",
- "core",
- "executor",
- "scheduler",
-]
-
-#[profile.release]
-#lto = true
-#codegen-units = 1
diff --git a/rust/ballista/rust/benchmarks/tpch/.dockerignore b/rust/ballista/rust/benchmarks/tpch/.dockerignore
deleted file mode 100644
index 3a7d0fd..0000000
--- a/rust/ballista/rust/benchmarks/tpch/.dockerignore
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Turn .dockerignore to .dockerallow by excluding everything and explicitly
-# allowing specific files and directories. This enables us to quickly add
-# dependency files to the docker content without scanning the whole directory.
-# This setup requires that all of our docker containers have arrow's source
-# as a mounted directory.
-
-data
-target
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/.gitignore b/rust/ballista/rust/benchmarks/tpch/.gitignore
deleted file mode 100644
index 6320cd2..0000000
--- a/rust/ballista/rust/benchmarks/tpch/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/Cargo.toml b/rust/ballista/rust/benchmarks/tpch/Cargo.toml
deleted file mode 100644
index 822d101..0000000
--- a/rust/ballista/rust/benchmarks/tpch/Cargo.toml
+++ /dev/null
@@ -1,36 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "tpch"
-version = "0.4.2-SNAPSHOT"
-homepage = "https://github.com/apache/arrow"
-repository = "https://github.com/apache/arrow"
-authors = ["Apache Arrow <de...@arrow.apache.org>"]
-license = "Apache-2.0"
-edition = "2018"
-
-[dependencies]
-ballista = { path="../../client" }
-
-arrow = { path = "../../../../arrow" }
-datafusion = { path = "../../../../datafusion" }
-parquet = { path = "../../../../parquet" }
-
-env_logger = "0.8"
-tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread"] }
-structopt = "0.3"
diff --git a/rust/ballista/rust/benchmarks/tpch/README.md b/rust/ballista/rust/benchmarks/tpch/README.md
deleted file mode 100644
index 6d77694..0000000
--- a/rust/ballista/rust/benchmarks/tpch/README.md
+++ /dev/null
@@ -1,104 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-# TPC-H Benchmarks
-
-TPC-H is an industry-standard benchmark for testing databases and query engines. A command-line tool is available that
-can generate the raw test data at any given scale factor (scale factor refers to the amount of data to be generated).
-
-## Generating Test Data
-
-TPC-H data can be generated using the `tpch-gen.sh` script, which creates a Docker image containing the TPC-H data
-generator.
-
-```bash
-./tpch-gen.sh
-```
-
-Data will be generated into the `data` subdirectory and will not be checked in because this directory has been added
-to the `.gitignore` file.
-
-## Running the Benchmarks
-
-To run the benchmarks it is necessary to have at least one Ballista scheduler and one Ballista executor running.
-
-To run the scheduler from source:
-
-```bash
-cd $ARROW_HOME/rust/ballista/rust/scheduler
-RUST_LOG=info cargo run --release
-```
-
-By default the scheduler will bind to `0.0.0.0` and listen on port 50050.
-
-To run the executor from source:
-
-```bash
-cd $ARROW_HOME/rust/ballista/rust/executor
-RUST_LOG=info cargo run --release
-```
-
-By default the executor will bind to `0.0.0.0` and listen on port 50051.
-
-You can add SIMD/snmalloc/LTO flags to improve speed (with longer build times):
-
-```bash
-RUST_LOG=info RUSTFLAGS='-C target-cpu=native -C lto -C codegen-units=1 -C embed-bitcode' cargo run --release --bin executor --features "simd snmalloc" --target x86_64-unknown-linux-gnu
-```
-
-To run the benchmarks:
-
-```bash
-cd $ARROW_HOME/rust/ballista/rust/benchmarks/tpch
-cargo run --release benchmark --host localhost --port 50050 --query 1 --path $(pwd)/data --format tbl
-```
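-
-The benchmark binary also provides a `convert` subcommand for converting the raw `tbl` data to CSV or Parquet; the following is a sketch based on the options defined in `src/main.rs` (the output path is a placeholder):
-
-```bash
-cargo run --release convert --input $(pwd)/data --output $(pwd)/data-parquet --format parquet
-```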
-
-## Running the Benchmarks on docker-compose
-
-To start a Rust scheduler and executor using Docker Compose:
-
-```bash
-cd $BALLISTA_HOME
-./dev/build-rust.sh
-cd $BALLISTA_HOME/rust/benchmarks/tpch
-docker-compose up
-```
-
-Then you can run the benchmark with:
-
-```bash
-docker-compose run ballista-client cargo run benchmark --host ballista-scheduler --port 50050 --query 1 --path /data --format tbl
-```
-
-## Expected output
-
-Query 1 should produce the following output when executed against the SF=1 dataset.
-
-```
-+--------------+--------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------+
-| l_returnflag | l_linestatus | sum_qty | sum_base_price | sum_disc_price | sum_charge | avg_qty | avg_price | avg_disc | count_order |
-+--------------+--------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------+
-| A | F | 37734107 | 56586554400.73001 | 53758257134.870026 | 55909065222.82768 | 25.522005853257337 | 38273.12973462168 | 0.049985295838396455 | 1478493 |
-| N | F | 991417 | 1487504710.3799996 | 1413082168.0541 | 1469649223.1943746 | 25.516471920522985 | 38284.467760848296 | 0.05009342667421622 | 38854 |
-| N | O | 74476023 | 111701708529.50996 | 106118209986.10472 | 110367023144.56622 | 25.502229680934594 | 38249.1238377803 | 0.049996589476752576 | 2920373 |
-| R | F | 37719753 | 56568041380.90001 | 53741292684.60399 | 55889619119.83194 | 25.50579361269077 | 38250.854626099666 | 0.05000940583012587 | 1478870 |
-+--------------+--------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------+
-Query 1 iteration 0 took 1956.1 ms
-Query 1 avg time: 1956.11 ms
-```
diff --git a/rust/ballista/rust/benchmarks/tpch/docker-compose.yaml b/rust/ballista/rust/benchmarks/tpch/docker-compose.yaml
deleted file mode 100644
index f872ce1..0000000
--- a/rust/ballista/rust/benchmarks/tpch/docker-compose.yaml
+++ /dev/null
@@ -1,62 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-version: '2.0'
-services:
- etcd:
- image: quay.io/coreos/etcd:v3.4.9
- command: "etcd -advertise-client-urls http://etcd:2379 -listen-client-urls http://0.0.0.0:2379"
- ballista-scheduler:
- image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT
- command: "/scheduler --config-backend etcd --etcd-urls etcd:2379 --bind-host 0.0.0.0 --port 50050"
- environment:
- - RUST_LOG=ballista=debug
- volumes:
- - ./data:/data
- depends_on:
- - etcd
- ballista-executor-1:
- image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT
- command: "/executor --bind-host 0.0.0.0 --port 50051 --external-host ballista-executor-1 --scheduler-host ballista-scheduler"
- environment:
- - RUST_LOG=info
- volumes:
- - ./data:/data
- depends_on:
- - ballista-scheduler
- ballista-executor-2:
- image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT
- command: "/executor --bind-host 0.0.0.0 --port 50052 --external-host ballista-executor-2 --scheduler-host ballista-scheduler"
- environment:
- - RUST_LOG=info
- volumes:
- - ./data:/data
- depends_on:
- - ballista-scheduler
- ballista-client:
- image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT
- command: "/bin/sh" # do nothing
- working_dir: /ballista/benchmarks/tpch
- environment:
- - RUST_LOG=info
- volumes:
- - ./data:/data
- - ../..:/ballista
- depends_on:
- - ballista-scheduler
- - ballista-executor-1
- - ballista-executor-2
-
diff --git a/rust/ballista/rust/benchmarks/tpch/entrypoint.sh b/rust/ballista/rust/benchmarks/tpch/entrypoint.sh
deleted file mode 100755
index 71c0432..0000000
--- a/rust/ballista/rust/benchmarks/tpch/entrypoint.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-cd /tpch-dbgen
-./dbgen -vf -s 1
-mv *.tbl /data
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q1.sql b/rust/ballista/rust/benchmarks/tpch/queries/q1.sql
deleted file mode 100644
index a0fcf15..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q1.sql
+++ /dev/null
@@ -1,21 +0,0 @@
-select
- l_returnflag,
- l_linestatus,
- sum(l_quantity) as sum_qty,
- sum(l_extendedprice) as sum_base_price,
- sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
- sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
- avg(l_quantity) as avg_qty,
- avg(l_extendedprice) as avg_price,
- avg(l_discount) as avg_disc,
- count(*) as count_order
-from
- lineitem
-where
- l_shipdate <= date '1998-09-02'
-group by
- l_returnflag,
- l_linestatus
-order by
- l_returnflag,
- l_linestatus;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q10.sql b/rust/ballista/rust/benchmarks/tpch/queries/q10.sql
deleted file mode 100644
index cf45e43..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q10.sql
+++ /dev/null
@@ -1,31 +0,0 @@
-select
- c_custkey,
- c_name,
- sum(l_extendedprice * (1 - l_discount)) as revenue,
- c_acctbal,
- n_name,
- c_address,
- c_phone,
- c_comment
-from
- customer,
- orders,
- lineitem,
- nation
-where
- c_custkey = o_custkey
- and l_orderkey = o_orderkey
- and o_orderdate >= date '1993-10-01'
- and o_orderdate < date '1994-01-01'
- and l_returnflag = 'R'
- and c_nationkey = n_nationkey
-group by
- c_custkey,
- c_name,
- c_acctbal,
- c_phone,
- n_name,
- c_address,
- c_comment
-order by
- revenue desc;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q11.sql b/rust/ballista/rust/benchmarks/tpch/queries/q11.sql
deleted file mode 100644
index c23ed1c..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q11.sql
+++ /dev/null
@@ -1,27 +0,0 @@
-select
- ps_partkey,
- sum(ps_supplycost * ps_availqty) as value
-from
- partsupp,
- supplier,
- nation
-where
- ps_suppkey = s_suppkey
- and s_nationkey = n_nationkey
- and n_name = 'GERMANY'
-group by
- ps_partkey having
- sum(ps_supplycost * ps_availqty) > (
- select
- sum(ps_supplycost * ps_availqty) * 0.0001
- from
- partsupp,
- supplier,
- nation
- where
- ps_suppkey = s_suppkey
- and s_nationkey = n_nationkey
- and n_name = 'GERMANY'
- )
-order by
- value desc;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q12.sql b/rust/ballista/rust/benchmarks/tpch/queries/q12.sql
deleted file mode 100644
index f8e6d96..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q12.sql
+++ /dev/null
@@ -1,30 +0,0 @@
-select
- l_shipmode,
- sum(case
- when o_orderpriority = '1-URGENT'
- or o_orderpriority = '2-HIGH'
- then 1
- else 0
- end) as high_line_count,
- sum(case
- when o_orderpriority <> '1-URGENT'
- and o_orderpriority <> '2-HIGH'
- then 1
- else 0
- end) as low_line_count
-from
- lineitem
- join
- orders
- on
- l_orderkey = o_orderkey
-where
- l_shipmode in ('MAIL', 'SHIP')
- and l_commitdate < l_receiptdate
- and l_shipdate < l_commitdate
- and l_receiptdate >= date '1994-01-01'
- and l_receiptdate < date '1995-01-01'
-group by
- l_shipmode
-order by
- l_shipmode;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q13.sql b/rust/ballista/rust/benchmarks/tpch/queries/q13.sql
deleted file mode 100644
index 4bfe8c3..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q13.sql
+++ /dev/null
@@ -1,20 +0,0 @@
-select
- c_count,
- count(*) as custdist
-from
- (
- select
- c_custkey,
- count(o_orderkey)
- from
- customer left outer join orders on
- c_custkey = o_custkey
- and o_comment not like '%special%requests%'
- group by
- c_custkey
- ) as c_orders (c_custkey, c_count)
-group by
- c_count
-order by
- custdist desc,
- c_count desc;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q14.sql b/rust/ballista/rust/benchmarks/tpch/queries/q14.sql
deleted file mode 100644
index d8ef6af..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q14.sql
+++ /dev/null
@@ -1,13 +0,0 @@
-select
- 100.00 * sum(case
- when p_type like 'PROMO%'
- then l_extendedprice * (1 - l_discount)
- else 0
- end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue
-from
- lineitem,
- part
-where
- l_partkey = p_partkey
- and l_shipdate >= date '1995-09-01'
- and l_shipdate < date '1995-10-01';
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q16.sql b/rust/ballista/rust/benchmarks/tpch/queries/q16.sql
deleted file mode 100644
index 36b7c07..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q16.sql
+++ /dev/null
@@ -1,30 +0,0 @@
-select
- p_brand,
- p_type,
- p_size,
- count(distinct ps_suppkey) as supplier_cnt
-from
- partsupp,
- part
-where
- p_partkey = ps_partkey
- and p_brand <> 'Brand#45'
- and p_type not like 'MEDIUM POLISHED%'
- and p_size in (49, 14, 23, 45, 19, 3, 36, 9)
- and ps_suppkey not in (
- select
- s_suppkey
- from
- supplier
- where
- s_comment like '%Customer%Complaints%'
-)
-group by
- p_brand,
- p_type,
- p_size
-order by
- supplier_cnt desc,
- p_brand,
- p_type,
- p_size;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q17.sql b/rust/ballista/rust/benchmarks/tpch/queries/q17.sql
deleted file mode 100644
index 1e65550..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q17.sql
+++ /dev/null
@@ -1,17 +0,0 @@
-select
- sum(l_extendedprice) / 7.0 as avg_yearly
-from
- lineitem,
- part
-where
- p_partkey = l_partkey
- and p_brand = 'Brand#23'
- and p_container = 'MED BOX'
- and l_quantity < (
- select
- 0.2 * avg(l_quantity)
- from
- lineitem
- where
- l_partkey = p_partkey
-);
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q18.sql b/rust/ballista/rust/benchmarks/tpch/queries/q18.sql
deleted file mode 100644
index 835de28..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q18.sql
+++ /dev/null
@@ -1,32 +0,0 @@
-select
- c_name,
- c_custkey,
- o_orderkey,
- o_orderdate,
- o_totalprice,
- sum(l_quantity)
-from
- customer,
- orders,
- lineitem
-where
- o_orderkey in (
- select
- l_orderkey
- from
- lineitem
- group by
- l_orderkey having
- sum(l_quantity) > 300
- )
- and c_custkey = o_custkey
- and o_orderkey = l_orderkey
-group by
- c_name,
- c_custkey,
- o_orderkey,
- o_orderdate,
- o_totalprice
-order by
- o_totalprice desc,
- o_orderdate;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q19.sql b/rust/ballista/rust/benchmarks/tpch/queries/q19.sql
deleted file mode 100644
index 56668e7..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q19.sql
+++ /dev/null
@@ -1,35 +0,0 @@
-select
- sum(l_extendedprice* (1 - l_discount)) as revenue
-from
- lineitem,
- part
-where
- (
- p_partkey = l_partkey
- and p_brand = 'Brand#12'
- and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
- and l_quantity >= 1 and l_quantity <= 1 + 10
- and p_size between 1 and 5
- and l_shipmode in ('AIR', 'AIR REG')
- and l_shipinstruct = 'DELIVER IN PERSON'
- )
- or
- (
- p_partkey = l_partkey
- and p_brand = 'Brand#23'
- and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
- and l_quantity >= 10 and l_quantity <= 10 + 10
- and p_size between 1 and 10
- and l_shipmode in ('AIR', 'AIR REG')
- and l_shipinstruct = 'DELIVER IN PERSON'
- )
- or
- (
- p_partkey = l_partkey
- and p_brand = 'Brand#34'
- and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
- and l_quantity >= 20 and l_quantity <= 20 + 10
- and p_size between 1 and 15
- and l_shipmode in ('AIR', 'AIR REG')
- and l_shipinstruct = 'DELIVER IN PERSON'
- );
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q2.sql b/rust/ballista/rust/benchmarks/tpch/queries/q2.sql
deleted file mode 100644
index f66af21..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q2.sql
+++ /dev/null
@@ -1,43 +0,0 @@
-select
- s_acctbal,
- s_name,
- n_name,
- p_partkey,
- p_mfgr,
- s_address,
- s_phone,
- s_comment
-from
- part,
- supplier,
- partsupp,
- nation,
- region
-where
- p_partkey = ps_partkey
- and s_suppkey = ps_suppkey
- and p_size = 15
- and p_type like '%BRASS'
- and s_nationkey = n_nationkey
- and n_regionkey = r_regionkey
- and r_name = 'EUROPE'
- and ps_supplycost = (
- select
- min(ps_supplycost)
- from
- partsupp,
- supplier,
- nation,
- region
- where
- p_partkey = ps_partkey
- and s_suppkey = ps_suppkey
- and s_nationkey = n_nationkey
- and n_regionkey = r_regionkey
- and r_name = 'EUROPE'
-)
-order by
- s_acctbal desc,
- n_name,
- s_name,
- p_partkey;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q20.sql b/rust/ballista/rust/benchmarks/tpch/queries/q20.sql
deleted file mode 100644
index f0339a6..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q20.sql
+++ /dev/null
@@ -1,37 +0,0 @@
-select
- s_name,
- s_address
-from
- supplier,
- nation
-where
- s_suppkey in (
- select
- ps_suppkey
- from
- partsupp
- where
- ps_partkey in (
- select
- p_partkey
- from
- part
- where
- p_name like 'forest%'
- )
- and ps_availqty > (
- select
- 0.5 * sum(l_quantity)
- from
- lineitem
- where
- l_partkey = ps_partkey
- and l_suppkey = ps_suppkey
- and l_shipdate >= date '1994-01-01'
- and l_shipdate < date '1994-01-01' + interval '1' year
- )
- )
- and s_nationkey = n_nationkey
- and n_name = 'CANADA'
-order by
- s_name;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q21.sql b/rust/ballista/rust/benchmarks/tpch/queries/q21.sql
deleted file mode 100644
index 9d2fe32..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q21.sql
+++ /dev/null
@@ -1,39 +0,0 @@
-select
- s_name,
- count(*) as numwait
-from
- supplier,
- lineitem l1,
- orders,
- nation
-where
- s_suppkey = l1.l_suppkey
- and o_orderkey = l1.l_orderkey
- and o_orderstatus = 'F'
- and l1.l_receiptdate > l1.l_commitdate
- and exists (
- select
- *
- from
- lineitem l2
- where
- l2.l_orderkey = l1.l_orderkey
- and l2.l_suppkey <> l1.l_suppkey
- )
- and not exists (
- select
- *
- from
- lineitem l3
- where
- l3.l_orderkey = l1.l_orderkey
- and l3.l_suppkey <> l1.l_suppkey
- and l3.l_receiptdate > l3.l_commitdate
- )
- and s_nationkey = n_nationkey
- and n_name = 'SAUDI ARABIA'
-group by
- s_name
-order by
- numwait desc,
- s_name;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q22.sql b/rust/ballista/rust/benchmarks/tpch/queries/q22.sql
deleted file mode 100644
index 90aea6f..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q22.sql
+++ /dev/null
@@ -1,37 +0,0 @@
-select
- cntrycode,
- count(*) as numcust,
- sum(c_acctbal) as totacctbal
-from
- (
- select
- substring(c_phone from 1 for 2) as cntrycode,
- c_acctbal
- from
- customer
- where
- substring(c_phone from 1 for 2) in
- ('13', '31', '23', '29', '30', '18', '17')
- and c_acctbal > (
- select
- avg(c_acctbal)
- from
- customer
- where
- c_acctbal > 0.00
- and substring(c_phone from 1 for 2) in
- ('13', '31', '23', '29', '30', '18', '17')
- )
- and not exists (
- select
- *
- from
- orders
- where
- o_custkey = c_custkey
- )
- ) as custsale
-group by
- cntrycode
-order by
- cntrycode;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q3.sql b/rust/ballista/rust/benchmarks/tpch/queries/q3.sql
deleted file mode 100644
index 7dbc6d9..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q3.sql
+++ /dev/null
@@ -1,22 +0,0 @@
-select
- l_orderkey,
- sum(l_extendedprice * (1 - l_discount)) as revenue,
- o_orderdate,
- o_shippriority
-from
- customer,
- orders,
- lineitem
-where
- c_mktsegment = 'BUILDING'
- and c_custkey = o_custkey
- and l_orderkey = o_orderkey
- and o_orderdate < date '1995-03-15'
- and l_shipdate > date '1995-03-15'
-group by
- l_orderkey,
- o_orderdate,
- o_shippriority
-order by
- revenue desc,
- o_orderdate;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q4.sql b/rust/ballista/rust/benchmarks/tpch/queries/q4.sql
deleted file mode 100644
index 74a620d..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q4.sql
+++ /dev/null
@@ -1,21 +0,0 @@
-select
- o_orderpriority,
- count(*) as order_count
-from
- orders
-where
- o_orderdate >= date '1993-07-01'
- and o_orderdate < date '1993-07-01' + interval '3' month
- and exists (
- select
- *
- from
- lineitem
- where
- l_orderkey = o_orderkey
- and l_commitdate < l_receiptdate
- )
-group by
- o_orderpriority
-order by
- o_orderpriority;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q5.sql b/rust/ballista/rust/benchmarks/tpch/queries/q5.sql
deleted file mode 100644
index 5a336b2..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q5.sql
+++ /dev/null
@@ -1,24 +0,0 @@
-select
- n_name,
- sum(l_extendedprice * (1 - l_discount)) as revenue
-from
- customer,
- orders,
- lineitem,
- supplier,
- nation,
- region
-where
- c_custkey = o_custkey
- and l_orderkey = o_orderkey
- and l_suppkey = s_suppkey
- and c_nationkey = s_nationkey
- and s_nationkey = n_nationkey
- and n_regionkey = r_regionkey
- and r_name = 'ASIA'
- and o_orderdate >= date '1994-01-01'
- and o_orderdate < date '1995-01-01'
-group by
- n_name
-order by
- revenue desc;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q6.sql b/rust/ballista/rust/benchmarks/tpch/queries/q6.sql
deleted file mode 100644
index 5806f98..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q6.sql
+++ /dev/null
@@ -1,9 +0,0 @@
-select
- sum(l_extendedprice * l_discount) as revenue
-from
- lineitem
-where
- l_shipdate >= date '1994-01-01'
- and l_shipdate < date '1995-01-01'
- and l_discount between 0.06 - 0.01 and 0.06 + 0.01
- and l_quantity < 24;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q7.sql b/rust/ballista/rust/benchmarks/tpch/queries/q7.sql
deleted file mode 100644
index d53877c..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q7.sql
+++ /dev/null
@@ -1,39 +0,0 @@
-select
- supp_nation,
- cust_nation,
- l_year,
- sum(volume) as revenue
-from
- (
- select
- n1.n_name as supp_nation,
- n2.n_name as cust_nation,
- extract(year from l_shipdate) as l_year,
- l_extendedprice * (1 - l_discount) as volume
- from
- supplier,
- lineitem,
- orders,
- customer,
- nation n1,
- nation n2
- where
- s_suppkey = l_suppkey
- and o_orderkey = l_orderkey
- and c_custkey = o_custkey
- and s_nationkey = n1.n_nationkey
- and c_nationkey = n2.n_nationkey
- and (
- (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY')
- or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE')
- )
- and l_shipdate between date '1995-01-01' and date '1996-12-31'
- ) as shipping
-group by
- supp_nation,
- cust_nation,
- l_year
-order by
- supp_nation,
- cust_nation,
- l_year;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q8.sql b/rust/ballista/rust/benchmarks/tpch/queries/q8.sql
deleted file mode 100644
index 6ddb2a6..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q8.sql
+++ /dev/null
@@ -1,37 +0,0 @@
-select
- o_year,
- sum(case
- when nation = 'BRAZIL' then volume
- else 0
- end) / sum(volume) as mkt_share
-from
- (
- select
- extract(year from o_orderdate) as o_year,
- l_extendedprice * (1 - l_discount) as volume,
- n2.n_name as nation
- from
- part,
- supplier,
- lineitem,
- orders,
- customer,
- nation n1,
- nation n2,
- region
- where
- p_partkey = l_partkey
- and s_suppkey = l_suppkey
- and l_orderkey = o_orderkey
- and o_custkey = c_custkey
- and c_nationkey = n1.n_nationkey
- and n1.n_regionkey = r_regionkey
- and r_name = 'AMERICA'
- and s_nationkey = n2.n_nationkey
- and o_orderdate between date '1995-01-01' and date '1996-12-31'
- and p_type = 'ECONOMY ANODIZED STEEL'
- ) as all_nations
-group by
- o_year
-order by
- o_year;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q9.sql b/rust/ballista/rust/benchmarks/tpch/queries/q9.sql
deleted file mode 100644
index 587bbc8..0000000
--- a/rust/ballista/rust/benchmarks/tpch/queries/q9.sql
+++ /dev/null
@@ -1,32 +0,0 @@
-select
- nation,
- o_year,
- sum(amount) as sum_profit
-from
- (
- select
- n_name as nation,
- extract(year from o_orderdate) as o_year,
- l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
- from
- part,
- supplier,
- lineitem,
- partsupp,
- orders,
- nation
- where
- s_suppkey = l_suppkey
- and ps_suppkey = l_suppkey
- and ps_partkey = l_partkey
- and p_partkey = l_partkey
- and o_orderkey = l_orderkey
- and s_nationkey = n_nationkey
- and p_name like '%green%'
- ) as profit
-group by
- nation,
- o_year
-order by
- nation,
- o_year desc;
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/run.sh b/rust/ballista/rust/benchmarks/tpch/run.sh
deleted file mode 100755
index c8a36b6..0000000
--- a/rust/ballista/rust/benchmarks/tpch/run.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-set -e
-
-# This bash script is meant to be run inside the docker-compose environment. Check the README for instructions
-
-for query in 1 3 5 6 10 12
-do
- /tpch benchmark --host ballista-scheduler --port 50050 --query $query --path /data --format tbl --iterations 1 --debug
-done
diff --git a/rust/ballista/rust/benchmarks/tpch/src/main.rs b/rust/ballista/rust/benchmarks/tpch/src/main.rs
deleted file mode 100644
index 1ba46ea..0000000
--- a/rust/ballista/rust/benchmarks/tpch/src/main.rs
+++ /dev/null
@@ -1,360 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Benchmark derived from TPC-H. This is not an official TPC-H benchmark.
-//!
-//! This is a modified version of the DataFusion version of these benchmarks.
-
-use std::collections::HashMap;
-use std::fs;
-use std::path::{Path, PathBuf};
-use std::time::Instant;
-
-use arrow::datatypes::{DataType, Field, Schema};
-use arrow::util::pretty;
-use ballista::prelude::*;
-use datafusion::prelude::*;
-use parquet::basic::Compression;
-use parquet::file::properties::WriterProperties;
-use structopt::StructOpt;
-
-#[derive(Debug, StructOpt)]
-struct BenchmarkOpt {
- /// Ballista executor host
- #[structopt(long = "host")]
- host: String,
-
- /// Ballista executor port
- #[structopt(long = "port")]
- port: u16,
-
- /// Query number
- #[structopt(long)]
- query: usize,
-
- /// Activate debug mode to see query results
- #[structopt(long)]
- debug: bool,
-
- /// Number of iterations of each test run
- #[structopt(long = "iterations", default_value = "1")]
- iterations: usize,
-
- /// Batch size when reading CSV or Parquet files
- #[structopt(long = "batch-size", default_value = "32768")]
- batch_size: usize,
-
- /// Path to data files
- #[structopt(parse(from_os_str), required = true, long = "path")]
- path: PathBuf,
-
- /// File format: `csv`, `tbl` or `parquet`
- #[structopt(long = "format")]
- file_format: String,
-}
-
-#[derive(Debug, StructOpt)]
-struct ConvertOpt {
- /// Path to csv files
- #[structopt(parse(from_os_str), required = true, short = "i", long = "input")]
- input_path: PathBuf,
-
- /// Output path
- #[structopt(parse(from_os_str), required = true, short = "o", long = "output")]
- output_path: PathBuf,
-
- /// Output file format: `csv` or `parquet`
- #[structopt(short = "f", long = "format")]
- file_format: String,
-
- /// Compression to use when writing Parquet files
- #[structopt(short = "c", long = "compression", default_value = "snappy")]
- compression: String,
-
- /// Number of partitions to produce
- #[structopt(short = "p", long = "partitions", default_value = "1")]
- partitions: usize,
-
- /// Batch size when reading CSV or Parquet files
- #[structopt(short = "s", long = "batch-size", default_value = "4096")]
- batch_size: usize,
-}
-
-#[derive(Debug, StructOpt)]
-#[structopt(name = "TPC-H", about = "TPC-H Benchmarks.")]
-enum TpchOpt {
- Benchmark(BenchmarkOpt),
- Convert(ConvertOpt),
-}
-
-const TABLES: &[&str] = &[
- "part", "supplier", "partsupp", "customer", "orders", "lineitem", "nation", "region",
-];
-
-#[tokio::main]
-async fn main() -> Result<()> {
- env_logger::init();
- match TpchOpt::from_args() {
- TpchOpt::Benchmark(opt) => benchmark(opt).await.map(|_| ()),
- TpchOpt::Convert(opt) => convert_tbl(opt).await,
- }
-}
-
-async fn benchmark(opt: BenchmarkOpt) -> Result<()> {
- println!("Running benchmarks with the following options: {:?}", opt);
-
- let mut settings = HashMap::new();
- settings.insert("batch.size".to_owned(), format!("{}", opt.batch_size));
-
- let ctx = BallistaContext::remote(opt.host.as_str(), opt.port, settings);
-
- // register tables with Ballista context
- let path = opt.path.to_str().unwrap();
- let file_format = opt.file_format.as_str();
- for table in TABLES {
- match file_format {
- // dbgen creates .tbl ('|' delimited) files without header
- "tbl" => {
- let path = format!("{}/{}.tbl", path, table);
- let schema = get_schema(table);
- let options = CsvReadOptions::new()
- .schema(&schema)
- .delimiter(b'|')
- .has_header(false)
- .file_extension(".tbl");
- ctx.register_csv(table, &path, options)?;
- }
- "csv" => {
- let path = format!("{}/{}", path, table);
- let schema = get_schema(table);
- let options = CsvReadOptions::new().schema(&schema).has_header(true);
- ctx.register_csv(table, &path, options)?;
- }
- "parquet" => {
- let path = format!("{}/{}", path, table);
- ctx.register_parquet(table, &path)?;
- }
- other => {
- unimplemented!("Invalid file format '{}'", other);
- }
- }
- }
-
- let mut millis = vec![];
-
- // run benchmark
- let sql = get_query_sql(opt.query)?;
- println!("Running benchmark with query {}:\n {}", opt.query, sql);
- for i in 0..opt.iterations {
- let start = Instant::now();
- let df = ctx.sql(&sql)?;
- let mut batches = vec![];
- let mut stream = df.collect().await?;
- while let Some(result) = stream.next().await {
- let batch = result?;
- batches.push(batch);
- }
- let elapsed = start.elapsed().as_secs_f64() * 1000.0;
- millis.push(elapsed);
- println!("Query {} iteration {} took {:.1} ms", opt.query, i, elapsed);
- if opt.debug {
- pretty::print_batches(&batches)?;
- }
- }
-
- let avg = millis.iter().sum::<f64>() / millis.len() as f64;
- println!("Query {} avg time: {:.2} ms", opt.query, avg);
-
- Ok(())
-}
-
-fn get_query_sql(query: usize) -> Result<String> {
- if query > 0 && query < 23 {
- let filename = format!("queries/q{}.sql", query);
- Ok(fs::read_to_string(&filename).expect("failed to read query"))
- } else {
- Err(BallistaError::General(
- "invalid query. Expected value between 1 and 22".to_owned(),
- ))
- }
-}
-
-async fn convert_tbl(opt: ConvertOpt) -> Result<()> {
- let output_root_path = Path::new(&opt.output_path);
- for table in TABLES {
- let start = Instant::now();
- let schema = get_schema(table);
-
- let input_path = format!("{}/{}.tbl", opt.input_path.to_str().unwrap(), table);
- let options = CsvReadOptions::new()
- .schema(&schema)
- .delimiter(b'|')
- .file_extension(".tbl");
-
- let config = ExecutionConfig::new().with_batch_size(opt.batch_size);
- let mut ctx = ExecutionContext::with_config(config);
-
- // build plan to read the TBL file
- let mut csv = ctx.read_csv(&input_path, options)?;
-
- // optionally, repartition the file
- if opt.partitions > 1 {
- csv = csv.repartition(Partitioning::RoundRobinBatch(opt.partitions))?
- }
-
- // create the physical plan
- let csv = csv.to_logical_plan();
- let csv = ctx.optimize(&csv)?;
- let csv = ctx.create_physical_plan(&csv)?;
-
- let output_path = output_root_path.join(table);
- let output_path = output_path.to_str().unwrap().to_owned();
-
- println!(
- "Converting '{}' to {} files in directory '{}'",
- &input_path, &opt.file_format, &output_path
- );
- match opt.file_format.as_str() {
- "csv" => ctx.write_csv(csv, output_path).await?,
- "parquet" => {
- let compression = match opt.compression.as_str() {
- "none" => Compression::UNCOMPRESSED,
- "snappy" => Compression::SNAPPY,
- "brotli" => Compression::BROTLI,
- "gzip" => Compression::GZIP,
- "lz4" => Compression::LZ4,
-                    "lzo" => Compression::LZO,
- "zstd" => Compression::ZSTD,
- other => {
- return Err(BallistaError::NotImplemented(format!(
- "Invalid compression format: {}",
- other
- )))
- }
- };
- let props = WriterProperties::builder()
- .set_compression(compression)
- .build();
- ctx.write_parquet(csv, output_path, Some(props)).await?
- }
- other => {
- return Err(BallistaError::NotImplemented(format!(
- "Invalid output format: {}",
- other
- )))
- }
- }
- println!("Conversion completed in {} ms", start.elapsed().as_millis());
- }
-
- Ok(())
-}
-
-fn get_schema(table: &str) -> Schema {
- // note that the schema intentionally uses signed integers so that any generated Parquet
- // files can also be used to benchmark tools that only support signed integers, such as
- // Apache Spark
-
- match table {
- "part" => Schema::new(vec![
- Field::new("p_partkey", DataType::Int32, false),
- Field::new("p_name", DataType::Utf8, false),
- Field::new("p_mfgr", DataType::Utf8, false),
- Field::new("p_brand", DataType::Utf8, false),
- Field::new("p_type", DataType::Utf8, false),
- Field::new("p_size", DataType::Int32, false),
- Field::new("p_container", DataType::Utf8, false),
- Field::new("p_retailprice", DataType::Float64, false),
- Field::new("p_comment", DataType::Utf8, false),
- ]),
-
- "supplier" => Schema::new(vec![
- Field::new("s_suppkey", DataType::Int32, false),
- Field::new("s_name", DataType::Utf8, false),
- Field::new("s_address", DataType::Utf8, false),
- Field::new("s_nationkey", DataType::Int32, false),
- Field::new("s_phone", DataType::Utf8, false),
- Field::new("s_acctbal", DataType::Float64, false),
- Field::new("s_comment", DataType::Utf8, false),
- ]),
-
- "partsupp" => Schema::new(vec![
- Field::new("ps_partkey", DataType::Int32, false),
- Field::new("ps_suppkey", DataType::Int32, false),
- Field::new("ps_availqty", DataType::Int32, false),
- Field::new("ps_supplycost", DataType::Float64, false),
- Field::new("ps_comment", DataType::Utf8, false),
- ]),
-
- "customer" => Schema::new(vec![
- Field::new("c_custkey", DataType::Int32, false),
- Field::new("c_name", DataType::Utf8, false),
- Field::new("c_address", DataType::Utf8, false),
- Field::new("c_nationkey", DataType::Int32, false),
- Field::new("c_phone", DataType::Utf8, false),
- Field::new("c_acctbal", DataType::Float64, false),
- Field::new("c_mktsegment", DataType::Utf8, false),
- Field::new("c_comment", DataType::Utf8, false),
- ]),
-
- "orders" => Schema::new(vec![
- Field::new("o_orderkey", DataType::Int32, false),
- Field::new("o_custkey", DataType::Int32, false),
- Field::new("o_orderstatus", DataType::Utf8, false),
- Field::new("o_totalprice", DataType::Float64, false),
- Field::new("o_orderdate", DataType::Date32, false),
- Field::new("o_orderpriority", DataType::Utf8, false),
- Field::new("o_clerk", DataType::Utf8, false),
- Field::new("o_shippriority", DataType::Int32, false),
- Field::new("o_comment", DataType::Utf8, false),
- ]),
-
- "lineitem" => Schema::new(vec![
- Field::new("l_orderkey", DataType::Int32, false),
- Field::new("l_partkey", DataType::Int32, false),
- Field::new("l_suppkey", DataType::Int32, false),
- Field::new("l_linenumber", DataType::Int32, false),
- Field::new("l_quantity", DataType::Float64, false),
- Field::new("l_extendedprice", DataType::Float64, false),
- Field::new("l_discount", DataType::Float64, false),
- Field::new("l_tax", DataType::Float64, false),
- Field::new("l_returnflag", DataType::Utf8, false),
- Field::new("l_linestatus", DataType::Utf8, false),
- Field::new("l_shipdate", DataType::Date32, false),
- Field::new("l_commitdate", DataType::Date32, false),
- Field::new("l_receiptdate", DataType::Date32, false),
- Field::new("l_shipinstruct", DataType::Utf8, false),
- Field::new("l_shipmode", DataType::Utf8, false),
- Field::new("l_comment", DataType::Utf8, false),
- ]),
-
- "nation" => Schema::new(vec![
- Field::new("n_nationkey", DataType::Int32, false),
- Field::new("n_name", DataType::Utf8, false),
- Field::new("n_regionkey", DataType::Int32, false),
- Field::new("n_comment", DataType::Utf8, false),
- ]),
-
- "region" => Schema::new(vec![
- Field::new("r_regionkey", DataType::Int32, false),
- Field::new("r_name", DataType::Utf8, false),
- Field::new("r_comment", DataType::Utf8, false),
- ]),
-
- _ => unimplemented!(),
- }
-}
diff --git a/rust/ballista/rust/benchmarks/tpch/tpch-gen.sh b/rust/ballista/rust/benchmarks/tpch/tpch-gen.sh
deleted file mode 100755
index f5147f5..0000000
--- a/rust/ballista/rust/benchmarks/tpch/tpch-gen.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-BALLISTA_VERSION=0.4.2-SNAPSHOT
-
-#set -e
-
-docker build -t ballistacompute/ballista-tpchgen:$BALLISTA_VERSION -f tpchgen.dockerfile .
-
-# Generate data into the ./data directory if it does not already exist
-FILE=./data/supplier.tbl
-if test -f "$FILE"; then
- echo "$FILE exists."
-else
-  mkdir -p data
-  docker run -v "$(pwd)"/data:/data -it --rm "ballistacompute/ballista-tpchgen:$BALLISTA_VERSION"
- ls -l data
-fi
\ No newline at end of file
diff --git a/rust/ballista/rust/benchmarks/tpch/tpchgen.dockerfile b/rust/ballista/rust/benchmarks/tpch/tpchgen.dockerfile
deleted file mode 100644
index 7fc2e50..0000000
--- a/rust/ballista/rust/benchmarks/tpch/tpchgen.dockerfile
+++ /dev/null
@@ -1,32 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-FROM ubuntu
-
-RUN apt-get update && \
- apt-get install -y git build-essential
-
-RUN git clone https://github.com/databricks/tpch-dbgen.git && \
- cd tpch-dbgen && \
- make
-
-WORKDIR /tpch-dbgen
-ADD entrypoint.sh /tpch-dbgen/
-
-VOLUME data
-
-ENTRYPOINT [ "bash", "./entrypoint.sh" ]
diff --git a/rust/ballista/rust/client/Cargo.toml b/rust/ballista/rust/client/Cargo.toml
deleted file mode 100644
index de3effe..0000000
--- a/rust/ballista/rust/client/Cargo.toml
+++ /dev/null
@@ -1,35 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "ballista"
-description = "Ballista Distributed Compute"
-license = "Apache-2.0"
-version = "0.4.2-SNAPSHOT"
-homepage = "https://github.com/apache/arrow"
-repository = "https://github.com/apache/arrow"
-authors = ["Apache Arrow <de...@arrow.apache.org>"]
-edition = "2018"
-
-[dependencies]
-ballista-core = { path = "../core" }
-futures = "0.3"
-log = "0.4"
-tokio = "1.0"
-
-arrow = { path = "../../../arrow" }
-datafusion = { path = "../../../datafusion" }
\ No newline at end of file
diff --git a/rust/ballista/rust/client/README.md b/rust/ballista/rust/client/README.md
deleted file mode 100644
index 00bf3ea..0000000
--- a/rust/ballista/rust/client/README.md
+++ /dev/null
@@ -1,22 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-# Ballista - Rust
-This crate contains the Ballista client library. For example usage, see the [TPC-H benchmark](../benchmarks/tpch/README.md).
-
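-Below is a minimal, hypothetical sketch of connecting to a Ballista scheduler
-and running a SQL query. The host, port, and CSV path are placeholders, and
-tokio's `macros` feature is assumed:
-
-```rust
-use ballista::prelude::*;
-use datafusion::physical_plan::csv::CsvReadOptions;
-
-#[tokio::main]
-async fn main() -> Result<()> {
-    // connect to a remote scheduler (placeholder endpoint)
-    let ctx = BallistaContext::remote("localhost", 50050, Default::default());
-    ctx.register_csv("trips", "/path/to/trips.csv", CsvReadOptions::new())?;
-    let df = ctx.sql("SELECT count(*) FROM trips")?;
-    // `collect` executes the query on the cluster and streams back the results
-    let mut stream = df.collect().await?;
-    while let Some(batch) = stream.next().await {
-        println!("{} rows", batch?.num_rows());
-    }
-    Ok(())
-}
-```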
diff --git a/rust/ballista/rust/client/src/columnar_batch.rs b/rust/ballista/rust/client/src/columnar_batch.rs
deleted file mode 100644
index d3ff886..0000000
--- a/rust/ballista/rust/client/src/columnar_batch.rs
+++ /dev/null
@@ -1,167 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::{collections::HashMap, sync::Arc};
-
-use ballista_core::error::{ballista_error, Result};
-
-use arrow::{
- array::ArrayRef,
- datatypes::{DataType, Schema},
- record_batch::RecordBatch,
-};
-use datafusion::scalar::ScalarValue;
-
-pub type MaybeColumnarBatch = Result<Option<ColumnarBatch>>;
-
-/// Batch of columnar data.
-#[allow(dead_code)]
-#[derive(Debug, Clone)]
-pub struct ColumnarBatch {
- schema: Arc<Schema>,
- columns: HashMap<String, ColumnarValue>,
-}
-
-impl ColumnarBatch {
- pub fn from_arrow(batch: &RecordBatch) -> Self {
- let columns = batch
- .columns()
- .iter()
- .enumerate()
- .map(|(i, array)| {
- (
- batch.schema().field(i).name().clone(),
- ColumnarValue::Columnar(array.clone()),
- )
- })
- .collect();
-
- Self {
- schema: batch.schema(),
- columns,
- }
- }
-
- pub fn from_values(values: &[ColumnarValue], schema: &Schema) -> Self {
- let columns = schema
- .fields()
- .iter()
- .enumerate()
- .map(|(i, f)| (f.name().clone(), values[i].clone()))
- .collect();
-
- Self {
- schema: Arc::new(schema.clone()),
- columns,
- }
- }
-
- pub fn to_arrow(&self) -> Result<RecordBatch> {
- let arrays = self
- .schema
- .fields()
- .iter()
- .map(|c| {
- match self.column(c.name())? {
- ColumnarValue::Columnar(array) => Ok(array.clone()),
- ColumnarValue::Scalar(_, _) => {
- // note that this can be implemented easily if needed
- Err(ballista_error("Cannot convert scalar value to Arrow array"))
- }
- }
- })
- .collect::<Result<Vec<_>>>()?;
-
- Ok(RecordBatch::try_new(self.schema.clone(), arrays)?)
- }
-
- pub fn schema(&self) -> Arc<Schema> {
- self.schema.clone()
- }
-
- pub fn num_columns(&self) -> usize {
- self.columns.len()
- }
-
- pub fn num_rows(&self) -> usize {
- self.columns[self.schema.field(0).name()].len()
- }
-
- pub fn column(&self, name: &str) -> Result<&ColumnarValue> {
- Ok(&self.columns[name])
- }
-
- pub fn memory_size(&self) -> usize {
- self.columns.values().map(|c| c.memory_size()).sum()
- }
-}
-
-/// A columnar value can either be a scalar value or an Arrow array.
-#[allow(dead_code)]
-#[derive(Debug, Clone)]
-pub enum ColumnarValue {
- Scalar(ScalarValue, usize),
- Columnar(ArrayRef),
-}
-
-impl ColumnarValue {
- pub fn len(&self) -> usize {
- match self {
- ColumnarValue::Scalar(_, n) => *n,
- ColumnarValue::Columnar(array) => array.len(),
- }
- }
-
- pub fn is_empty(&self) -> bool {
- self.len() == 0
- }
-
- pub fn data_type(&self) -> &DataType {
- match self {
- ColumnarValue::Columnar(array) => array.data_type(),
- ColumnarValue::Scalar(value, _) => match value {
- ScalarValue::UInt8(_) => &DataType::UInt8,
- ScalarValue::UInt16(_) => &DataType::UInt16,
- ScalarValue::UInt32(_) => &DataType::UInt32,
- ScalarValue::UInt64(_) => &DataType::UInt64,
- ScalarValue::Int8(_) => &DataType::Int8,
- ScalarValue::Int16(_) => &DataType::Int16,
- ScalarValue::Int32(_) => &DataType::Int32,
- ScalarValue::Int64(_) => &DataType::Int64,
- ScalarValue::Float32(_) => &DataType::Float32,
- ScalarValue::Float64(_) => &DataType::Float64,
- _ => unimplemented!(),
- },
- }
- }
-
- pub fn to_arrow(&self) -> ArrayRef {
- match self {
- ColumnarValue::Columnar(array) => array.clone(),
- ColumnarValue::Scalar(value, n) => value.to_array_of_size(*n),
- }
- }
-
- pub fn memory_size(&self) -> usize {
- match self {
- ColumnarValue::Columnar(array) => array.get_array_memory_size(),
- _ => 0,
- }
- }
-}
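-
-// A hypothetical test sketch (not part of the original crate) illustrating the
-// scalar/columnar duality: a scalar carries a logical length and broadcasts to
-// an Arrow array of that size via `to_arrow`.
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use datafusion::scalar::ScalarValue;
-
-    #[test]
-    fn scalar_broadcasts_to_array() {
-        // a scalar `1i32` repeated over 3 logical rows
-        let v = ColumnarValue::Scalar(ScalarValue::Int32(Some(1)), 3);
-        assert_eq!(v.len(), 3);
-        assert_eq!(v.to_arrow().len(), 3);
-    }
-}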
diff --git a/rust/ballista/rust/client/src/context.rs b/rust/ballista/rust/client/src/context.rs
deleted file mode 100644
index 400f6b6..0000000
--- a/rust/ballista/rust/client/src/context.rs
+++ /dev/null
@@ -1,400 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Distributed execution context.
-
-use std::path::PathBuf;
-use std::pin::Pin;
-use std::sync::{Arc, Mutex};
-use std::{collections::HashMap, convert::TryInto};
-use std::{fs, time::Duration};
-
-use ballista_core::serde::protobuf::scheduler_grpc_client::SchedulerGrpcClient;
-use ballista_core::serde::protobuf::{
- execute_query_params::Query, job_status, ExecuteQueryParams, GetJobStatusParams,
- GetJobStatusResult,
-};
-use ballista_core::{
- client::BallistaClient,
- datasource::DFTableAdapter,
- error::{BallistaError, Result},
- memory_stream::MemoryStream,
- utils::create_datafusion_context,
-};
-
-use arrow::datatypes::Schema;
-use datafusion::catalog::TableReference;
-use datafusion::logical_plan::{DFSchema, Expr, LogicalPlan, Partitioning};
-use datafusion::physical_plan::csv::CsvReadOptions;
-use datafusion::{dataframe::DataFrame, physical_plan::RecordBatchStream};
-use log::{error, info};
-
-#[allow(dead_code)]
-struct BallistaContextState {
- /// Scheduler host
- scheduler_host: String,
- /// Scheduler port
- scheduler_port: u16,
- /// Tables that have been registered with this context
- tables: HashMap<String, LogicalPlan>,
- /// General purpose settings
- settings: HashMap<String, String>,
-}
-
-impl BallistaContextState {
- pub fn new(
- scheduler_host: String,
- scheduler_port: u16,
- settings: HashMap<String, String>,
- ) -> Self {
- Self {
- scheduler_host,
- scheduler_port,
- tables: HashMap::new(),
- settings,
- }
- }
-}
-
-#[allow(dead_code)]
-pub struct BallistaContext {
- state: Arc<Mutex<BallistaContextState>>,
-}
-
-impl BallistaContext {
- /// Create a context for executing queries against a remote Ballista scheduler instance
- pub fn remote(host: &str, port: u16, settings: HashMap<String, String>) -> Self {
- let state = BallistaContextState::new(host.to_owned(), port, settings);
-
- Self {
- state: Arc::new(Mutex::new(state)),
- }
- }
-
-    /// Create a DataFrame representing a Parquet table scan
-    pub fn read_parquet(&self, path: &str) -> Result<BallistaDataFrame> {
- // convert to absolute path because the executor likely has a different working directory
- let path = PathBuf::from(path);
- let path = fs::canonicalize(&path)?;
-
- // use local DataFusion context for now but later this might call the scheduler
- let mut ctx = create_datafusion_context();
- let df = ctx.read_parquet(path.to_str().unwrap())?;
- Ok(BallistaDataFrame::from(self.state.clone(), df))
- }
-
-    /// Create a DataFrame representing a CSV table scan
-    pub fn read_csv(
- &self,
- path: &str,
- options: CsvReadOptions,
- ) -> Result<BallistaDataFrame> {
- // convert to absolute path because the executor likely has a different working directory
- let path = PathBuf::from(path);
- let path = fs::canonicalize(&path)?;
-
- // use local DataFusion context for now but later this might call the scheduler
- let mut ctx = create_datafusion_context();
- let df = ctx.read_csv(path.to_str().unwrap(), options)?;
- Ok(BallistaDataFrame::from(self.state.clone(), df))
- }
-
- /// Register a DataFrame as a table that can be referenced from a SQL query
- pub fn register_table(&self, name: &str, table: &BallistaDataFrame) -> Result<()> {
- let mut state = self.state.lock().unwrap();
- state
- .tables
- .insert(name.to_owned(), table.to_logical_plan());
- Ok(())
- }
-
- pub fn register_csv(
- &self,
- name: &str,
- path: &str,
- options: CsvReadOptions,
- ) -> Result<()> {
- let df = self.read_csv(path, options)?;
- self.register_table(name, &df)
- }
-
- pub fn register_parquet(&self, name: &str, path: &str) -> Result<()> {
- let df = self.read_parquet(path)?;
- self.register_table(name, &df)
- }
-
- /// Create a DataFrame from a SQL statement
- pub fn sql(&self, sql: &str) -> Result<BallistaDataFrame> {
- // use local DataFusion context for now but later this might call the scheduler
- let mut ctx = create_datafusion_context();
- // register tables
- let state = self.state.lock().unwrap();
- for (name, plan) in &state.tables {
- let plan = ctx.optimize(plan)?;
- let execution_plan = ctx.create_physical_plan(&plan)?;
- ctx.register_table(
- TableReference::Bare { table: name },
- Arc::new(DFTableAdapter::new(plan, execution_plan)),
- )?;
- }
- let df = ctx.sql(sql)?;
- Ok(BallistaDataFrame::from(self.state.clone(), df))
- }
-}
-
-/// The Ballista DataFrame is a wrapper around the DataFusion DataFrame and overrides the
-/// `collect` method so that the query is executed against Ballista and not DataFusion.
-pub struct BallistaDataFrame {
- /// Ballista context state
- state: Arc<Mutex<BallistaContextState>>,
- /// DataFusion DataFrame representing logical query plan
- df: Arc<dyn DataFrame>,
-}
-
-impl BallistaDataFrame {
- fn from(state: Arc<Mutex<BallistaContextState>>, df: Arc<dyn DataFrame>) -> Self {
- Self { state, df }
- }
-
- pub async fn collect(&self) -> Result<Pin<Box<dyn RecordBatchStream + Send + Sync>>> {
- let scheduler_url = {
- let state = self.state.lock().unwrap();
-
- format!("http://{}:{}", state.scheduler_host, state.scheduler_port)
- };
-
- info!("Connecting to Ballista scheduler at {}", scheduler_url);
-
- let mut scheduler = SchedulerGrpcClient::connect(scheduler_url).await?;
-
- let plan = self.df.to_logical_plan();
- let schema: Schema = plan.schema().as_ref().clone().into();
-
- let job_id = scheduler
- .execute_query(ExecuteQueryParams {
- query: Some(Query::LogicalPlan((&plan).try_into()?)),
- })
- .await?
- .into_inner()
- .job_id;
-
- loop {
- let GetJobStatusResult { status } = scheduler
- .get_job_status(GetJobStatusParams {
- job_id: job_id.clone(),
- })
- .await?
- .into_inner();
- let status = status.and_then(|s| s.status).ok_or_else(|| {
- BallistaError::Internal("Received empty status message".to_owned())
- })?;
- let wait_future = tokio::time::sleep(Duration::from_millis(100));
- match status {
- job_status::Status::Queued(_) => {
- info!("Job {} still queued...", job_id);
- wait_future.await;
- }
- job_status::Status::Running(_) => {
- info!("Job {} is running...", job_id);
- wait_future.await;
- }
- job_status::Status::Failed(err) => {
- let msg = format!("Job {} failed: {}", job_id, err.error);
- error!("{}", msg);
- break Err(BallistaError::General(msg));
- }
- job_status::Status::Completed(completed) => {
- // TODO: use streaming. Probably need to change the signature of fetch_partition to achieve that
- let mut result = vec![];
- for location in completed.partition_location {
- let metadata = location.executor_meta.ok_or_else(|| {
- BallistaError::Internal(
- "Received empty executor metadata".to_owned(),
- )
- })?;
- let partition_id = location.partition_id.ok_or_else(|| {
- BallistaError::Internal(
- "Received empty partition id".to_owned(),
- )
- })?;
- let mut ballista_client = BallistaClient::try_new(
- metadata.host.as_str(),
- metadata.port as u16,
- )
- .await?;
- let stream = ballista_client
- .fetch_partition(
- &partition_id.job_id,
- partition_id.stage_id as usize,
- partition_id.partition_id as usize,
- )
- .await?;
- result.append(
- &mut datafusion::physical_plan::common::collect(stream)
- .await?,
- );
- }
- break Ok(Box::pin(MemoryStream::try_new(
- result,
- Arc::new(schema),
- None,
- )?));
- }
- };
- }
- }
-
- pub fn select_columns(&self, columns: &[&str]) -> Result<BallistaDataFrame> {
- Ok(Self::from(
- self.state.clone(),
- self.df
- .select_columns(columns)
- .map_err(BallistaError::from)?,
- ))
- }
-
- pub fn select(&self, expr: Vec<Expr>) -> Result<BallistaDataFrame> {
- Ok(Self::from(
- self.state.clone(),
- self.df.select(expr).map_err(BallistaError::from)?,
- ))
- }
-
- pub fn filter(&self, expr: Expr) -> Result<BallistaDataFrame> {
- Ok(Self::from(
- self.state.clone(),
- self.df.filter(expr).map_err(BallistaError::from)?,
- ))
- }
-
- pub fn aggregate(
- &self,
- group_expr: Vec<Expr>,
- aggr_expr: Vec<Expr>,
- ) -> Result<BallistaDataFrame> {
- Ok(Self::from(
- self.state.clone(),
- self.df
- .aggregate(group_expr, aggr_expr)
- .map_err(BallistaError::from)?,
- ))
- }
-
- pub fn limit(&self, n: usize) -> Result<BallistaDataFrame> {
- Ok(Self::from(
- self.state.clone(),
- self.df.limit(n).map_err(BallistaError::from)?,
- ))
- }
-
- pub fn sort(&self, expr: Vec<Expr>) -> Result<BallistaDataFrame> {
- Ok(Self::from(
- self.state.clone(),
- self.df.sort(expr).map_err(BallistaError::from)?,
- ))
- }
-
- // TODO lifetime issue
- // pub fn join(&self, right: Arc<dyn DataFrame>, join_type: JoinType, left_cols: &[&str], right_cols: &[&str]) ->
- // Result<BallistaDataFrame> { Ok(Self::from(self.state.clone(), self.df.join(right, join_type, &left_cols,
- // &right_cols).map_err(BallistaError::from)?)) }
-
- pub fn repartition(
- &self,
- partitioning_scheme: Partitioning,
- ) -> Result<BallistaDataFrame> {
- Ok(Self::from(
- self.state.clone(),
- self.df
- .repartition(partitioning_scheme)
- .map_err(BallistaError::from)?,
- ))
- }
-
- pub fn schema(&self) -> &DFSchema {
- self.df.schema()
- }
-
- pub fn to_logical_plan(&self) -> LogicalPlan {
- self.df.to_logical_plan()
- }
-
- pub fn explain(&self, verbose: bool) -> Result<BallistaDataFrame> {
- Ok(Self::from(
- self.state.clone(),
- self.df.explain(verbose).map_err(BallistaError::from)?,
- ))
- }
-}
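-
-// Sketch of the override in action (hypothetical snippet): the query is planned
-// through the wrapped DataFusion DataFrame, but `collect` submits the logical
-// plan to the scheduler and polls the job status until all partitions can be
-// fetched:
-//
-//     let df = ctx.sql("SELECT 1")?;        // plans locally
-//     let mut stream = df.collect().await?; // executes on the cluster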
-
-// #[async_trait]
-// impl ExecutionContext for BallistaContext {
-// async fn get_executor_ids(&self) -> Result<Vec<ExecutorMeta>> {
-// match &self.config.discovery_mode {
-// DiscoveryMode::Etcd => etcd_get_executors(&self.config.etcd_urls, "default").await,
-// DiscoveryMode::Kubernetes => k8s_get_executors("default", "ballista").await,
-// DiscoveryMode::Standalone => Err(ballista_error("Standalone mode not implemented yet")),
-// }
-// }
-//
-// async fn execute_task(
-// &self,
-// executor_meta: ExecutorMeta,
-// task: ExecutionTask,
-// ) -> Result<ShuffleId> {
-// // TODO what is the point of returning this info since it is based on input arg?
-// let shuffle_id = ShuffleId::new(task.job_uuid, task.stage_id, task.partition_id);
-//
-// let _ = execute_action(
-// &executor_meta.host,
-// executor_meta.port,
-// &Action::Execute(task),
-// )
-// .await?;
-//
-// Ok(shuffle_id)
-// }
-//
-// async fn read_shuffle(&self, shuffle_id: &ShuffleId) -> Result<Vec<ColumnarBatch>> {
-// match self.shuffle_locations.get(shuffle_id) {
-// Some(executor_meta) => {
-// let batches = execute_action(
-// &executor_meta.host,
-// executor_meta.port,
-// &Action::FetchShuffle(*shuffle_id),
-// )
-// .await?;
-// Ok(batches
-// .iter()
-// .map(|b| ColumnarBatch::from_arrow(b))
-// .collect())
-// }
-// _ => Err(ballista_error(&format!(
-// "Failed to resolve executor UUID for shuffle ID {:?}",
-// shuffle_id
-// ))),
-// }
-// }
-//
-// fn config(&self) -> ExecutorConfig {
-// self.config.clone()
-// }
-// }
diff --git a/rust/ballista/rust/client/src/lib.rs b/rust/ballista/rust/client/src/lib.rs
deleted file mode 100644
index c3c6291..0000000
--- a/rust/ballista/rust/client/src/lib.rs
+++ /dev/null
@@ -1,20 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-pub mod columnar_batch;
-pub mod context;
-pub mod prelude;
diff --git a/rust/ballista/rust/client/src/prelude.rs b/rust/ballista/rust/client/src/prelude.rs
deleted file mode 100644
index 2f940ae..0000000
--- a/rust/ballista/rust/client/src/prelude.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Ballista Prelude (common imports)
-
-pub use crate::context::BallistaContext;
-pub use ballista_core::error::{BallistaError, Result};
-
-pub use futures::StreamExt;
diff --git a/rust/ballista/rust/core/Cargo.toml b/rust/ballista/rust/core/Cargo.toml
deleted file mode 100644
index e37a1ea..0000000
--- a/rust/ballista/rust/core/Cargo.toml
+++ /dev/null
@@ -1,50 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[package]
-name = "ballista-core"
-description = "Ballista Distributed Compute"
-license = "Apache-2.0"
-version = "0.4.2-SNAPSHOT"
-homepage = "https://github.com/apache/arrow"
-repository = "https://github.com/apache/arrow"
-authors = ["Apache Arrow <de...@arrow.apache.org>"]
-edition = "2018"
-build = "build.rs"
-
-[features]
-simd = ["datafusion/simd"]
-
-[dependencies]
-async-trait = "0.1.36"
-futures = "0.3"
-log = "0.4"
-prost = "0.7"
-serde = {version = "1", features = ["derive"]}
-sqlparser = "0.8"
-tokio = "1.0"
-tonic = "0.4"
-uuid = { version = "0.8", features = ["v4"] }
-
-arrow = { path = "../../../arrow" }
-arrow-flight = { path = "../../../arrow-flight" }
-datafusion = { path = "../../../datafusion" }
-
-[build-dependencies]
-tonic-build = { version = "0.4" }
diff --git a/rust/ballista/rust/core/README.md b/rust/ballista/rust/core/README.md
deleted file mode 100644
index f97952b..0000000
--- a/rust/ballista/rust/core/README.md
+++ /dev/null
@@ -1,21 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-# Ballista - Rust
-This crate contains the core Ballista types.
diff --git a/rust/ballista/rust/core/build.rs b/rust/ballista/rust/core/build.rs
deleted file mode 100644
index 6ad153e..0000000
--- a/rust/ballista/rust/core/build.rs
+++ /dev/null
@@ -1,26 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-fn main() -> Result<(), String> {
- // for use in docker build where file changes can be wonky
- println!("cargo:rerun-if-env-changed=FORCE_REBUILD");
-
- println!("cargo:rerun-if-changed=proto/ballista.proto");
- tonic_build::configure()
- .compile(&["proto/ballista.proto"], &["proto"])
- .map_err(|e| format!("protobuf compilation failed: {}", e))
-}
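-
-// Hypothetical usage: changing the variable's value between builds, e.g.
-// `FORCE_REBUILD=1 cargo build` then `FORCE_REBUILD=2 cargo build`, forces
-// this build script (and the protobuf codegen) to re-run.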
diff --git a/rust/ballista/rust/core/proto/ballista.proto b/rust/ballista/rust/core/proto/ballista.proto
deleted file mode 100644
index 5733921..0000000
--- a/rust/ballista/rust/core/proto/ballista.proto
+++ /dev/null
@@ -1,824 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * <p>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-syntax = "proto3";
-
-package ballista.protobuf;
-
-option java_multiple_files = true;
-option java_package = "org.ballistacompute.protobuf";
-option java_outer_classname = "BallistaProto";
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Ballista Logical Plan
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-// logical expressions
-message LogicalExprNode {
- oneof ExprType {
- // column references
- string column_name = 1;
-
- // alias
- AliasNode alias = 2;
-
- ScalarValue literal = 3;
-
- // binary expressions
- BinaryExprNode binary_expr = 4;
-
- // aggregate expressions
- AggregateExprNode aggregate_expr = 5;
-
- // null checks
- IsNull is_null_expr = 6;
- IsNotNull is_not_null_expr = 7;
- Not not_expr = 8;
-
- BetweenNode between = 9;
- CaseNode case_ = 10;
- CastNode cast = 11;
- SortExprNode sort = 12;
- NegativeNode negative = 13;
- InListNode in_list = 14;
- bool wildcard = 15;
- ScalarFunctionNode scalar_function = 16;
- TryCastNode try_cast = 17;
- }
-}
-
-message IsNull {
- LogicalExprNode expr = 1;
-}
-
-message IsNotNull {
- LogicalExprNode expr = 1;
-}
-
-message Not {
- LogicalExprNode expr = 1;
-}
-
-message AliasNode {
- LogicalExprNode expr = 1;
- string alias = 2;
-}
-
-message BinaryExprNode {
- LogicalExprNode l = 1;
- LogicalExprNode r = 2;
- string op = 3;
-}
-
-message NegativeNode {
- LogicalExprNode expr = 1;
-}
-
-message InListNode {
- LogicalExprNode expr = 1;
- repeated LogicalExprNode list = 2;
- bool negated = 3;
-}
-
-enum ScalarFunction {
- SQRT = 0;
- SIN = 1;
- COS = 2;
- TAN = 3;
- ASIN = 4;
- ACOS = 5;
- ATAN = 6;
- EXP = 7;
- LOG = 8;
- LOG2 = 9;
- LOG10 = 10;
- FLOOR = 11;
- CEIL = 12;
- ROUND = 13;
- TRUNC = 14;
- ABS = 15;
- SIGNUM = 16;
- OCTETLENGTH = 17;
- CONCAT = 18;
- LOWER = 19;
- UPPER = 20;
- TRIM = 21;
- LTRIM = 22;
- RTRIM = 23;
- TOTIMESTAMP = 24;
- ARRAY = 25;
- NULLIF = 26;
- DATETRUNC = 27;
- MD5 = 28;
- SHA224 = 29;
- SHA256 = 30;
- SHA384 = 31;
- SHA512 = 32;
-}
-
-message ScalarFunctionNode {
- ScalarFunction fun = 1;
- repeated LogicalExprNode expr = 2;
-}
-
-enum AggregateFunction {
- MIN = 0;
- MAX = 1;
- SUM = 2;
- AVG = 3;
- COUNT = 4;
-}
-
-message AggregateExprNode {
- AggregateFunction aggr_function = 1;
- LogicalExprNode expr = 2;
-}
-
-message BetweenNode {
- LogicalExprNode expr = 1;
- bool negated = 2;
- LogicalExprNode low = 3;
- LogicalExprNode high = 4;
-}
-
-message CaseNode {
- LogicalExprNode expr = 1;
- repeated WhenThen when_then_expr = 2;
- LogicalExprNode else_expr = 3;
-}
-
-message WhenThen {
- LogicalExprNode when_expr = 1;
- LogicalExprNode then_expr = 2;
-}
-
-message CastNode {
- LogicalExprNode expr = 1;
- ArrowType arrow_type = 2;
-}
-
-message TryCastNode {
- LogicalExprNode expr = 1;
- ArrowType arrow_type = 2;
-}
-
-message SortExprNode {
- LogicalExprNode expr = 1;
- bool asc = 2;
- bool nulls_first = 3;
-}
-
-// LogicalPlan is a nested type
-message LogicalPlanNode {
- oneof LogicalPlanType {
- CsvTableScanNode csv_scan = 1;
- ParquetTableScanNode parquet_scan = 2;
- ProjectionNode projection = 3;
- SelectionNode selection = 4;
- LimitNode limit = 5;
- AggregateNode aggregate = 6;
- JoinNode join = 7;
- SortNode sort = 8;
- RepartitionNode repartition = 9;
- EmptyRelationNode empty_relation = 10;
- CreateExternalTableNode create_external_table = 11;
- ExplainNode explain = 12;
- }
-}
-
-message ProjectionColumns {
- repeated string columns = 1;
-}
-
-message CsvTableScanNode {
- string table_name = 1;
- string path = 2;
- bool has_header = 3;
- string delimiter = 4;
- string file_extension = 5;
- ProjectionColumns projection = 6;
- Schema schema = 7;
- repeated LogicalExprNode filters = 8;
-}
-
-message ParquetTableScanNode {
- string table_name = 1;
- string path = 2;
- ProjectionColumns projection = 3;
- Schema schema = 4;
- repeated LogicalExprNode filters = 5;
-}
-
-message ProjectionNode {
- LogicalPlanNode input = 1;
- repeated LogicalExprNode expr = 2;
-}
-
-message SelectionNode {
- LogicalPlanNode input = 1;
- LogicalExprNode expr = 2;
-}
-
-message SortNode{
- LogicalPlanNode input = 1;
- repeated LogicalExprNode expr = 2;
-}
-
-message RepartitionNode{
- LogicalPlanNode input = 1;
- oneof partition_method {
- uint64 round_robin = 2;
- HashRepartition hash = 3;
- }
-}
-
-message HashRepartition {
- repeated LogicalExprNode hash_expr = 1;
- uint64 partition_count = 2;
-}
-
-message EmptyRelationNode{
- bool produce_one_row = 1;
-}
-
-message CreateExternalTableNode{
- string name = 1;
- string location = 2;
- FileType file_type = 3;
- bool has_header = 4;
- Schema schema = 5;
-}
-
-enum FileType{
- NdJson = 0;
- Parquet = 1;
- CSV = 2;
-}
-
-message ExplainNode{
- LogicalPlanNode input = 1;
- bool verbose = 2;
-}
-
-message DfField{
- string qualifier = 2;
- Field field = 1;
-}
-
-message AggregateNode {
- LogicalPlanNode input = 1;
- repeated LogicalExprNode group_expr = 2;
- repeated LogicalExprNode aggr_expr = 3;
-}
-
-enum JoinType {
- INNER = 0;
- LEFT = 1;
- RIGHT = 2;
-}
-
-message JoinNode {
- LogicalPlanNode left = 1;
- LogicalPlanNode right = 2;
- JoinType join_type = 3;
- repeated string left_join_column = 4;
- repeated string right_join_column = 5;
-}
-
-message LimitNode {
- LogicalPlanNode input = 1;
- uint32 limit = 2;
-}
-
-message SelectionExecNode {
- LogicalExprNode expr = 1;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Ballista Physical Plan
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-// PhysicalPlanNode is a nested type
-message PhysicalPlanNode {
- oneof PhysicalPlanType {
- ParquetScanExecNode parquet_scan = 1;
- CsvScanExecNode csv_scan = 2;
- EmptyExecNode empty = 3;
- ProjectionExecNode projection = 4;
- GlobalLimitExecNode global_limit = 6;
- LocalLimitExecNode local_limit = 7;
- HashAggregateExecNode hash_aggregate = 8;
- HashJoinExecNode hash_join = 9;
- ShuffleReaderExecNode shuffle_reader = 10;
- SortExecNode sort = 11;
- CoalesceBatchesExecNode coalesce_batches = 12;
- FilterExecNode filter = 13;
- MergeExecNode merge = 14;
- UnresolvedShuffleExecNode unresolved = 15;
- RepartitionExecNode repartition = 16;
- }
-}
-
-message UnresolvedShuffleExecNode {
- repeated uint32 query_stage_ids = 1;
- Schema schema = 2;
- uint32 partition_count = 3;
-}
-
-message FilterExecNode {
- PhysicalPlanNode input = 1;
- LogicalExprNode expr = 2;
-}
-
-message ParquetScanExecNode {
- repeated string filename = 1;
- repeated uint32 projection = 2;
- uint32 num_partitions = 3;
- uint32 batch_size = 4;
-}
-
-message CsvScanExecNode {
- string path = 1;
- repeated uint32 projection = 2;
- Schema schema = 3;
- string file_extension = 4;
- bool has_header = 5;
- uint32 batch_size = 6;
- string delimiter = 7;
-
- // partition filenames
- repeated string filename = 8;
-}
-
-message HashJoinExecNode {
- PhysicalPlanNode left = 1;
- PhysicalPlanNode right = 2;
- repeated JoinOn on = 3;
-  JoinType join_type = 4;
-}
-
-message JoinOn {
- string left = 1;
- string right = 2;
-}
-
-message EmptyExecNode {
- bool produce_one_row = 1;
- Schema schema = 2;
-}
-
-message ProjectionExecNode {
- PhysicalPlanNode input = 1;
- repeated LogicalExprNode expr = 2;
- repeated string expr_name = 3;
-}
-
-enum AggregateMode {
- PARTIAL = 0;
- FINAL = 1;
-}
-
-message HashAggregateExecNode {
- repeated LogicalExprNode group_expr = 1;
- repeated LogicalExprNode aggr_expr = 2;
- AggregateMode mode = 3;
- PhysicalPlanNode input = 4;
- repeated string group_expr_name = 5;
- repeated string aggr_expr_name = 6;
- // we need the input schema to the partial aggregate to pass to the final aggregate
- Schema input_schema = 7;
-}
-
-message ShuffleReaderExecNode {
- repeated PartitionLocation partition_location = 1;
- Schema schema = 2;
-}
-
-message GlobalLimitExecNode {
- PhysicalPlanNode input = 1;
- uint32 limit = 2;
-}
-
-message LocalLimitExecNode {
- PhysicalPlanNode input = 1;
- uint32 limit = 2;
-}
-
-message SortExecNode {
- PhysicalPlanNode input = 1;
- repeated LogicalExprNode expr = 2;
-}
-
-message CoalesceBatchesExecNode {
- PhysicalPlanNode input = 1;
- uint32 target_batch_size = 2;
-}
-
-message MergeExecNode {
- PhysicalPlanNode input = 1;
-}
-
-message RepartitionExecNode{
- PhysicalPlanNode input = 1;
- oneof partition_method {
- uint64 round_robin = 2;
- HashRepartition hash = 3;
- uint64 unknown = 4;
- }
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Ballista Scheduling
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-message KeyValuePair {
- string key = 1;
- string value = 2;
-}
-
-message Action {
-
- oneof ActionType {
- // Execute a logical query plan
- LogicalPlanNode query = 1;
-
- // Execute one partition of a physical query plan
- ExecutePartition execute_partition = 2;
-
- // Fetch a partition from an executor
- PartitionId fetch_partition = 3;
- }
-
- // configuration settings
- repeated KeyValuePair settings = 100;
-}
-
-message ExecutePartition {
- string job_id = 1;
- uint32 stage_id = 2;
- repeated uint32 partition_id = 3;
- PhysicalPlanNode plan = 4;
- // The task could need to read partitions from other executors
- repeated PartitionLocation partition_location = 5;
-}
-
-// Mapping from partition id to executor id
-message PartitionLocation {
- PartitionId partition_id = 1;
- ExecutorMetadata executor_meta = 2;
- PartitionStats partition_stats = 3;
-}
-
-// Unique identifier for a materialized partition of data
-message PartitionId {
- string job_id = 1;
- uint32 stage_id = 2;
- uint32 partition_id = 4;
-}
-
-message PartitionStats {
- int64 num_rows = 1;
- int64 num_batches = 2;
- int64 num_bytes = 3;
- repeated ColumnStats column_stats = 4;
-}
-
-message ColumnStats {
- ScalarValue min_value = 1;
- ScalarValue max_value = 2;
- uint32 null_count = 3;
- uint32 distinct_count = 4;
-}
-
-message ExecutorMetadata {
- string id = 1;
- string host = 2;
- uint32 port = 3;
-}
-
-message GetExecutorMetadataParams {}
-
-message GetExecutorMetadataResult {
- repeated ExecutorMetadata metadata = 1;
-}
-
-message RunningTask {
- string executor_id = 1;
-}
-
-message FailedTask {
- string error = 1;
-}
-
-message CompletedTask {
- string executor_id = 1;
-}
-
-message TaskStatus {
- PartitionId partition_id = 1;
- oneof status {
- RunningTask running = 2;
- FailedTask failed = 3;
- CompletedTask completed = 4;
- }
-}
-
-message PollWorkParams {
- ExecutorMetadata metadata = 1;
- bool can_accept_task = 2;
- // All tasks must be reported until they reach the failed or completed state
- repeated TaskStatus task_status = 3;
-}
-
-message TaskDefinition {
- PartitionId task_id = 1;
- PhysicalPlanNode plan = 2;
-}
-
-message PollWorkResult {
- TaskDefinition task = 1;
-}
-
-message ExecuteQueryParams {
- oneof query {
- LogicalPlanNode logical_plan = 1;
- string sql = 2;
-  }
-}
-
-message ExecuteSqlParams {
- string sql = 1;
-}
-
-message ExecuteQueryResult {
- string job_id = 1;
-}
-
-message GetJobStatusParams {
- string job_id = 1;
-}
-
-message CompletedJob {
- repeated PartitionLocation partition_location = 1;
-}
-
-message QueuedJob {}
-
-// TODO: add progress report
-message RunningJob {}
-
-message FailedJob {
- string error = 1;
-}
-
-message JobStatus {
- oneof status {
- QueuedJob queued = 1;
- RunningJob running = 2;
- FailedJob failed = 3;
- CompletedJob completed = 4;
- }
-}
-
-message GetJobStatusResult {
- JobStatus status = 1;
-}
-
-message GetFileMetadataParams {
- string path = 1;
- FileType file_type = 2;
-}
-
-message GetFileMetadataResult {
- Schema schema = 1;
- repeated FilePartitionMetadata partitions = 2;
-}
-
-message FilePartitionMetadata {
- repeated string filename = 1;
-}
-
-service SchedulerGrpc {
- rpc GetExecutorsMetadata (GetExecutorMetadataParams) returns (GetExecutorMetadataResult) {}
-
- // Executors must poll the scheduler for heartbeat and to receive tasks
- rpc PollWork (PollWorkParams) returns (PollWorkResult) {}
-
- rpc GetFileMetadata (GetFileMetadataParams) returns (GetFileMetadataResult) {}
-
- rpc ExecuteQuery (ExecuteQueryParams) returns (ExecuteQueryResult) {}
-
- rpc GetJobStatus (GetJobStatusParams) returns (GetJobStatusResult) {}
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Arrow Data Types
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-message Schema {
- repeated Field columns = 1;
-}
-
-message Field {
- // name of the field
- string name = 1;
- ArrowType arrow_type = 2;
- bool nullable = 3;
- // for complex data types like structs, unions
- repeated Field children = 4;
-}
-
-message FixedSizeBinary{
- int32 length = 1;
-}
-
-message Timestamp{
- TimeUnit time_unit = 1;
- string timezone = 2;
-}
-
-enum DateUnit{
- Day = 0;
- DateMillisecond = 1;
-}
-
-enum TimeUnit{
- Second = 0;
- TimeMillisecond = 1;
- Microsecond = 2;
- Nanosecond = 3;
-}
-
-enum IntervalUnit{
- YearMonth = 0;
- DayTime = 1;
-}
-
-message Decimal{
- uint64 whole = 1;
- uint64 fractional = 2;
-}
-
-message List{
- Field field_type = 1;
-}
-
-message FixedSizeList{
- Field field_type = 1;
- int32 list_size = 2;
-}
-
-message Dictionary{
- ArrowType key = 1;
- ArrowType value = 2;
-}
-
-message Struct{
- repeated Field sub_field_types = 1;
-}
-
-message Union{
- repeated Field union_types = 1;
-}
-
-message ScalarListValue{
-  ScalarType datatype = 1;
-  repeated ScalarValue values = 2;
-}
-
-message ScalarValue{
- oneof value{
- bool bool_value = 1;
- string utf8_value = 2;
- string large_utf8_value = 3;
- int32 int8_value = 4;
- int32 int16_value = 5;
- int32 int32_value = 6;
- int64 int64_value = 7;
- uint32 uint8_value = 8;
- uint32 uint16_value = 9;
- uint32 uint32_value = 10;
- uint64 uint64_value = 11;
- float float32_value = 12;
- double float64_value = 13;
-    // a literal Date32 value always has a unit of days
- int32 date_32_value = 14;
- int64 time_microsecond_value = 15;
- int64 time_nanosecond_value = 16;
- ScalarListValue list_value = 17;
- ScalarType null_list_value = 18;
-
- PrimitiveScalarType null_value = 19;
- }
-}
-
-// Contains all valid datafusion scalar type except for
-// List
-enum PrimitiveScalarType{
- BOOL = 0; // arrow::Type::BOOL
- UINT8 = 1; // arrow::Type::UINT8
- INT8 = 2; // arrow::Type::INT8
- UINT16 = 3; // represents arrow::Type fields in src/arrow/type.h
- INT16 = 4;
- UINT32 = 5;
- INT32 = 6;
- UINT64 = 7;
- INT64 = 8;
- FLOAT32 = 9;
- FLOAT64 = 10;
- UTF8 = 11;
- LARGE_UTF8 = 12;
- DATE32 = 13;
- TIME_MICROSECOND = 14;
- TIME_NANOSECOND = 15;
- NULL = 16;
-}
-
-message ScalarType{
- oneof datatype{
- PrimitiveScalarType scalar = 1;
- ScalarListType list = 2;
- }
-}
-
-message ScalarListType{
- repeated string field_names = 3;
- PrimitiveScalarType deepest_type = 2;
-}
-
-// Broken out into multiple message types so that type
-// metadata does not need to live in a separate message.
-// Types represented by EmptyMessage carry no additional
-// metadata about the type.
-message ArrowType{
-  oneof arrow_type_enum{
-    EmptyMessage NONE = 1;   // arrow::Type::NA
-    EmptyMessage BOOL = 2;   // arrow::Type::BOOL
-    EmptyMessage UINT8 = 3;  // arrow::Type::UINT8
-    EmptyMessage INT8 = 4;   // arrow::Type::INT8
-    EmptyMessage UINT16 = 5; // represents arrow::Type fields in src/arrow/type.h
-    EmptyMessage INT16 = 6;
-    EmptyMessage UINT32 = 7;
-    EmptyMessage INT32 = 8;
-    EmptyMessage UINT64 = 9;
-    EmptyMessage INT64 = 10;
-    EmptyMessage FLOAT16 = 11;
-    EmptyMessage FLOAT32 = 12;
-    EmptyMessage FLOAT64 = 13;
-    EmptyMessage UTF8 = 14;
-    EmptyMessage LARGE_UTF8 = 32;
-    EmptyMessage BINARY = 15;
-    int32 FIXED_SIZE_BINARY = 16;
-    EmptyMessage LARGE_BINARY = 31;
-    EmptyMessage DATE32 = 17;
-    EmptyMessage DATE64 = 18;
-    TimeUnit DURATION = 19;
-    Timestamp TIMESTAMP = 20;
-    TimeUnit TIME32 = 21;
-    TimeUnit TIME64 = 22;
-    IntervalUnit INTERVAL = 23;
-    Decimal DECIMAL = 24;
-    List LIST = 25;
-    List LARGE_LIST = 26;
-    FixedSizeList FIXED_SIZE_LIST = 27;
-    Struct STRUCT = 28;
-    Union UNION = 29;
-    Dictionary DICTIONARY = 30;
-  }
-}
-
-// Useful for representing an empty enum variant in Rust,
-// e.g. `enum Example { One, Two(i32) }` maps to:
-//
-// message Example {
-//   oneof value {
-//     EmptyMessage one = 1;
-//     int32 two = 2;
-//   }
-// }
-message EmptyMessage{}
diff --git a/rust/ballista/rust/core/src/client.rs b/rust/ballista/rust/core/src/client.rs
deleted file mode 100644
index f64f95f..0000000
--- a/rust/ballista/rust/core/src/client.rs
+++ /dev/null
@@ -1,224 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Client API for sending requests to executors.
-
-use std::sync::Arc;
-use std::{collections::HashMap, pin::Pin};
-use std::{
- convert::{TryFrom, TryInto},
- task::{Context, Poll},
-};
-
-use crate::error::{ballista_error, BallistaError, Result};
-use crate::memory_stream::MemoryStream;
-use crate::serde::protobuf::{self};
-use crate::serde::scheduler::{
- Action, ExecutePartition, ExecutePartitionResult, PartitionId, PartitionStats,
-};
-
-use arrow::record_batch::RecordBatch;
-use arrow::{
- array::{StringArray, StructArray},
- error::{ArrowError, Result as ArrowResult},
-};
-use arrow::{datatypes::Schema, datatypes::SchemaRef};
-use arrow_flight::utils::flight_data_to_arrow_batch;
-use arrow_flight::Ticket;
-use arrow_flight::{flight_service_client::FlightServiceClient, FlightData};
-use datafusion::physical_plan::common::collect;
-use datafusion::physical_plan::{ExecutionPlan, SendableRecordBatchStream};
-use datafusion::{logical_plan::LogicalPlan, physical_plan::RecordBatchStream};
-use futures::{Stream, StreamExt};
-use log::debug;
-use prost::Message;
-use tonic::Streaming;
-use uuid::Uuid;
-
-/// Client for interacting with Ballista executors.
-#[derive(Clone)]
-pub struct BallistaClient {
- flight_client: FlightServiceClient<tonic::transport::channel::Channel>,
-}
-
-impl BallistaClient {
- /// Create a new BallistaClient to connect to the executor listening on the specified
-    /// host and port.
-    pub async fn try_new(host: &str, port: u16) -> Result<Self> {
- let addr = format!("http://{}:{}", host, port);
- debug!("BallistaClient connecting to {}", addr);
- let flight_client =
- FlightServiceClient::connect(addr.clone())
- .await
- .map_err(|e| {
- BallistaError::General(format!(
- "Error connecting to Ballista scheduler or executor at {}: {:?}",
- addr, e
- ))
- })?;
- debug!("BallistaClient connected OK");
-
- Ok(Self { flight_client })
- }
-
- /// Execute one partition of a physical query plan against the executor
- pub async fn execute_partition(
- &mut self,
- job_id: String,
- stage_id: usize,
- partition_id: Vec<usize>,
- plan: Arc<dyn ExecutionPlan>,
- ) -> Result<Vec<ExecutePartitionResult>> {
- let action = Action::ExecutePartition(ExecutePartition {
- job_id,
- stage_id,
- partition_id,
- plan,
- shuffle_locations: Default::default(),
- });
- let stream = self.execute_action(&action).await?;
- let batches = collect(stream).await?;
-
- batches
- .iter()
- .map(|batch| {
- if batch.num_rows() != 1 {
- Err(BallistaError::General(
- "execute_partition received wrong number of rows".to_owned(),
- ))
- } else {
- let path = batch
- .column(0)
- .as_any()
- .downcast_ref::<StringArray>()
- .expect(
- "execute_partition expected column 0 to be a StringArray",
- );
-
- let stats = batch
- .column(1)
- .as_any()
- .downcast_ref::<StructArray>()
- .expect(
- "execute_partition expected column 1 to be a StructArray",
- );
-
- Ok(ExecutePartitionResult::new(
- path.value(0),
- PartitionStats::from_arrow_struct_array(stats),
- ))
- }
- })
- .collect::<Result<Vec<_>>>()
- }
-
- /// Fetch a partition from an executor
- pub async fn fetch_partition(
- &mut self,
- job_id: &str,
- stage_id: usize,
- partition_id: usize,
- ) -> Result<SendableRecordBatchStream> {
- let action =
- Action::FetchPartition(PartitionId::new(job_id, stage_id, partition_id));
- self.execute_action(&action).await
- }
-
- /// Execute an action and retrieve the results
- pub async fn execute_action(
- &mut self,
- action: &Action,
- ) -> Result<SendableRecordBatchStream> {
- let serialized_action: protobuf::Action = action.to_owned().try_into()?;
-
- let mut buf: Vec<u8> = Vec::with_capacity(serialized_action.encoded_len());
-
- serialized_action
- .encode(&mut buf)
- .map_err(|e| BallistaError::General(format!("{:?}", e)))?;
-
- let request = tonic::Request::new(Ticket { ticket: buf });
-
- let mut stream = self
- .flight_client
- .do_get(request)
- .await
- .map_err(|e| BallistaError::General(format!("{:?}", e)))?
- .into_inner();
-
-        // the first message in the stream must be the schema; otherwise, error out
- match stream
- .message()
- .await
- .map_err(|e| BallistaError::General(format!("{:?}", e)))?
- {
- Some(flight_data) => {
- // convert FlightData to a stream
- let schema = Arc::new(Schema::try_from(&flight_data)?);
-
- // all the remaining stream messages should be dictionary and record batches
- Ok(Box::pin(FlightDataStream::new(stream, schema)))
- }
- None => Err(ballista_error(
- "Did not receive schema batch from flight server",
- )),
- }
- }
-}
-
-struct FlightDataStream {
- stream: Streaming<FlightData>,
- schema: SchemaRef,
-}
-
-impl FlightDataStream {
- pub fn new(stream: Streaming<FlightData>, schema: SchemaRef) -> Self {
- Self { stream, schema }
- }
-}
-
-impl Stream for FlightDataStream {
- type Item = ArrowResult<RecordBatch>;
-
- fn poll_next(
- mut self: std::pin::Pin<&mut Self>,
- cx: &mut Context<'_>,
- ) -> Poll<Option<Self::Item>> {
- self.stream.poll_next_unpin(cx).map(|x| match x {
- Some(flight_data_chunk_result) => {
- let converted_chunk = flight_data_chunk_result
- .map_err(|e| ArrowError::from_external_error(Box::new(e)))
- .and_then(|flight_data_chunk| {
- flight_data_to_arrow_batch(
- &flight_data_chunk,
- self.schema.clone(),
- &[],
- )
- });
- Some(converted_chunk)
- }
- None => None,
- })
- }
-}
-
-impl RecordBatchStream for FlightDataStream {
- fn schema(&self) -> SchemaRef {
- self.schema.clone()
- }
-}
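For reference, a minimal sketch of how this removed client was typically driven, assuming the pre-removal crate is importable as `ballista_core`, a tokio runtime is available, and the address and partition identifiers are placeholders:

use ballista_core::client::BallistaClient;
use ballista_core::error::Result;
use futures::StreamExt;

#[tokio::main]
async fn main() -> Result<()> {
    // Connect to an executor's Flight endpoint (placeholder address).
    let mut client = BallistaClient::try_new("localhost", 50051).await?;
    // Fetch a previously materialized shuffle partition and drain the stream;
    // each item converts from ArrowError via the crate's From impl.
    let mut stream = client.fetch_partition("job-1", 0, 0).await?;
    while let Some(batch) = stream.next().await {
        println!("received a batch with {} rows", batch?.num_rows());
    }
    Ok(())
}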
diff --git a/rust/ballista/rust/core/src/datasource.rs b/rust/ballista/rust/core/src/datasource.rs
deleted file mode 100644
index 8ff0df4..0000000
--- a/rust/ballista/rust/core/src/datasource.rs
+++ /dev/null
@@ -1,72 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::{any::Any, sync::Arc};
-
-use arrow::datatypes::SchemaRef;
-use datafusion::error::Result as DFResult;
-use datafusion::{
- datasource::{datasource::Statistics, TableProvider},
- logical_plan::{Expr, LogicalPlan},
- physical_plan::ExecutionPlan,
-};
-
-/// This ugly adapter is needed because we use DataFusion's logical plan when building queries,
-/// and when we register tables with DataFusion's `ExecutionContext` we need to provide a
-/// TableProvider that is effectively a wrapper around a physical plan. We need to be able to
-/// register tables so that we can create logical plans from SQL statements that reference these
-/// tables.
-pub struct DFTableAdapter {
- /// DataFusion logical plan
- pub logical_plan: LogicalPlan,
- /// DataFusion execution plan
- plan: Arc<dyn ExecutionPlan>,
-}
-
-impl DFTableAdapter {
- pub fn new(logical_plan: LogicalPlan, plan: Arc<dyn ExecutionPlan>) -> Self {
- Self { logical_plan, plan }
- }
-}
-
-impl TableProvider for DFTableAdapter {
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- fn schema(&self) -> SchemaRef {
- self.plan.schema()
- }
-
- fn scan(
- &self,
- _projection: &Option<Vec<usize>>,
- _batch_size: usize,
- _filters: &[Expr],
- _limit: Option<usize>,
- ) -> DFResult<Arc<dyn ExecutionPlan>> {
- Ok(self.plan.clone())
- }
-
- fn statistics(&self) -> Statistics {
- Statistics {
- num_rows: None,
- total_byte_size: None,
- column_statistics: None,
- }
- }
-}
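A sketch of the registration pattern this adapter enabled, assuming the pre-removal `ballista_core` crate and that DataFusion's `ExecutionContext::register_table` of this era accepts a name and a boxed `TableProvider`:

use std::sync::Arc;

use ballista_core::datasource::DFTableAdapter;
use datafusion::execution::context::ExecutionContext;
use datafusion::logical_plan::LogicalPlan;
use datafusion::physical_plan::ExecutionPlan;

fn register_adapter(
    ctx: &mut ExecutionContext,
    name: &str,
    logical_plan: LogicalPlan,
    physical_plan: Arc<dyn ExecutionPlan>,
) {
    // Wrap the already-planned query so later SQL statements can reference it by name.
    let adapter = DFTableAdapter::new(logical_plan, physical_plan);
    ctx.register_table(name, Box::new(adapter));
}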
diff --git a/rust/ballista/rust/core/src/error.rs b/rust/ballista/rust/core/src/error.rs
deleted file mode 100644
index d0155ce..0000000
--- a/rust/ballista/rust/core/src/error.rs
+++ /dev/null
@@ -1,172 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Ballista error types
-
-use std::{
- error::Error,
- fmt::{Display, Formatter},
- io, result,
-};
-
-use arrow::error::ArrowError;
-use datafusion::error::DataFusionError;
-use sqlparser::parser;
-
-pub type Result<T> = result::Result<T, BallistaError>;
-
-/// Ballista error
-#[derive(Debug)]
-pub enum BallistaError {
- NotImplemented(String),
- General(String),
- Internal(String),
- ArrowError(ArrowError),
- DataFusionError(DataFusionError),
- SqlError(parser::ParserError),
- IoError(io::Error),
- // ReqwestError(reqwest::Error),
- //HttpError(http::Error),
- // KubeAPIError(kube::error::Error),
- // KubeAPIRequestError(k8s_openapi::RequestError),
- // KubeAPIResponseError(k8s_openapi::ResponseError),
- TonicError(tonic::transport::Error),
- GrpcError(tonic::Status),
- TokioError(tokio::task::JoinError),
-}
-
-impl<T> Into<Result<T>> for BallistaError {
- fn into(self) -> Result<T> {
- Err(self)
- }
-}
-
-pub fn ballista_error(message: &str) -> BallistaError {
- BallistaError::General(message.to_owned())
-}
-
-impl From<String> for BallistaError {
- fn from(e: String) -> Self {
- BallistaError::General(e)
- }
-}
-
-impl From<ArrowError> for BallistaError {
- fn from(e: ArrowError) -> Self {
- BallistaError::ArrowError(e)
- }
-}
-
-impl From<parser::ParserError> for BallistaError {
- fn from(e: parser::ParserError) -> Self {
- BallistaError::SqlError(e)
- }
-}
-
-impl From<DataFusionError> for BallistaError {
- fn from(e: DataFusionError) -> Self {
- BallistaError::DataFusionError(e)
- }
-}
-
-impl From<io::Error> for BallistaError {
- fn from(e: io::Error) -> Self {
- BallistaError::IoError(e)
- }
-}
-
-// impl From<reqwest::Error> for BallistaError {
-// fn from(e: reqwest::Error) -> Self {
-// BallistaError::ReqwestError(e)
-// }
-// }
-//
-// impl From<http::Error> for BallistaError {
-// fn from(e: http::Error) -> Self {
-// BallistaError::HttpError(e)
-// }
-// }
-
-// impl From<kube::error::Error> for BallistaError {
-// fn from(e: kube::error::Error) -> Self {
-// BallistaError::KubeAPIError(e)
-// }
-// }
-
-// impl From<k8s_openapi::RequestError> for BallistaError {
-// fn from(e: k8s_openapi::RequestError) -> Self {
-// BallistaError::KubeAPIRequestError(e)
-// }
-// }
-
-// impl From<k8s_openapi::ResponseError> for BallistaError {
-// fn from(e: k8s_openapi::ResponseError) -> Self {
-// BallistaError::KubeAPIResponseError(e)
-// }
-// }
-
-impl From<tonic::transport::Error> for BallistaError {
- fn from(e: tonic::transport::Error) -> Self {
- BallistaError::TonicError(e)
- }
-}
-
-impl From<tonic::Status> for BallistaError {
- fn from(e: tonic::Status) -> Self {
- BallistaError::GrpcError(e)
- }
-}
-
-impl From<tokio::task::JoinError> for BallistaError {
- fn from(e: tokio::task::JoinError) -> Self {
- BallistaError::TokioError(e)
- }
-}
-
-impl Display for BallistaError {
- fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
- match self {
- BallistaError::NotImplemented(ref desc) => {
- write!(f, "Not implemented: {}", desc)
- }
- BallistaError::General(ref desc) => write!(f, "General error: {}", desc),
- BallistaError::ArrowError(ref desc) => write!(f, "Arrow error: {}", desc),
- BallistaError::DataFusionError(ref desc) => {
- write!(f, "DataFusion error: {:?}", desc)
- }
- BallistaError::SqlError(ref desc) => write!(f, "SQL error: {:?}", desc),
- BallistaError::IoError(ref desc) => write!(f, "IO error: {}", desc),
- // BallistaError::ReqwestError(ref desc) => write!(f, "Reqwest error: {}", desc),
- // BallistaError::HttpError(ref desc) => write!(f, "HTTP error: {}", desc),
- // BallistaError::KubeAPIError(ref desc) => write!(f, "Kube API error: {}", desc),
- // BallistaError::KubeAPIRequestError(ref desc) => {
- // write!(f, "KubeAPI request error: {}", desc)
- // }
- // BallistaError::KubeAPIResponseError(ref desc) => {
- // write!(f, "KubeAPI response error: {}", desc)
- // }
- BallistaError::TonicError(desc) => write!(f, "Tonic error: {}", desc),
- BallistaError::GrpcError(desc) => write!(f, "Grpc error: {}", desc),
- BallistaError::Internal(desc) => {
- write!(f, "Internal Ballista error: {}", desc)
- }
- BallistaError::TokioError(desc) => write!(f, "Tokio join error: {}", desc),
- }
- }
-}
-
-impl Error for BallistaError {}
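These `From` impls are what let call sites bubble heterogeneous failures up with `?`; a minimal sketch (the manifest path is a placeholder, and the crate path `ballista_core` is an assumption):

use ballista_core::error::Result;

fn read_manifest(path: &str) -> Result<String> {
    // std::io::Error converts into BallistaError::IoError via the From impl above.
    let contents = std::fs::read_to_string(path)?;
    Ok(contents)
}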
diff --git a/rust/ballista/rust/core/src/execution_plans/mod.rs b/rust/ballista/rust/core/src/execution_plans/mod.rs
deleted file mode 100644
index 1fb2010..0000000
--- a/rust/ballista/rust/core/src/execution_plans/mod.rs
+++ /dev/null
@@ -1,27 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! This module contains the execution plans that are needed to distribute DataFusion's
-//! execution plans across several Ballista executors.
-
-mod query_stage;
-mod shuffle_reader;
-mod unresolved_shuffle;
-
-pub use query_stage::QueryStageExec;
-pub use shuffle_reader::ShuffleReaderExec;
-pub use unresolved_shuffle::UnresolvedShuffleExec;
diff --git a/rust/ballista/rust/core/src/execution_plans/query_stage.rs b/rust/ballista/rust/core/src/execution_plans/query_stage.rs
deleted file mode 100644
index d8822ea..0000000
--- a/rust/ballista/rust/core/src/execution_plans/query_stage.rs
+++ /dev/null
@@ -1,92 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::sync::Arc;
-use std::{any::Any, pin::Pin};
-
-use arrow::datatypes::SchemaRef;
-use async_trait::async_trait;
-use datafusion::physical_plan::{ExecutionPlan, Partitioning};
-use datafusion::{error::Result, physical_plan::RecordBatchStream};
-use uuid::Uuid;
-
-/// QueryStageExec represents a section of a query plan that has consistent partitioning and
-/// can be executed as one unit with each partition being executed in parallel. The output of
-/// a query stage either forms the input of another query stage or can be the final result of
-/// a query.
-#[derive(Debug, Clone)]
-pub struct QueryStageExec {
- /// Unique ID for the job (query) that this stage is a part of
- pub job_id: String,
- /// Unique query stage ID within the job
- pub stage_id: usize,
- /// Physical execution plan for this query stage
- pub child: Arc<dyn ExecutionPlan>,
-}
-
-impl QueryStageExec {
- /// Create a new query stage
- pub fn try_new(
- job_id: String,
- stage_id: usize,
- child: Arc<dyn ExecutionPlan>,
- ) -> Result<Self> {
- Ok(Self {
- job_id,
- stage_id,
- child,
- })
- }
-}
-
-#[async_trait]
-impl ExecutionPlan for QueryStageExec {
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- fn schema(&self) -> SchemaRef {
- self.child.schema()
- }
-
- fn output_partitioning(&self) -> Partitioning {
- self.child.output_partitioning()
- }
-
- fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> {
- vec![self.child.clone()]
- }
-
- fn with_new_children(
- &self,
- children: Vec<Arc<dyn ExecutionPlan>>,
- ) -> Result<Arc<dyn ExecutionPlan>> {
- assert_eq!(children.len(), 1);
- Ok(Arc::new(QueryStageExec::try_new(
- self.job_id.clone(),
- self.stage_id,
- children[0].clone(),
- )?))
- }
-
- async fn execute(
- &self,
- partition: usize,
- ) -> Result<Pin<Box<dyn RecordBatchStream + Send + Sync>>> {
- self.child.execute(partition).await
- }
-}
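Creating a stage was a thin wrapping step; a sketch of scheduler-side usage, with the job id, stage id, and child plan supplied by the caller (crate path assumed):

use std::sync::Arc;

use ballista_core::execution_plans::QueryStageExec;
use datafusion::error::Result;
use datafusion::physical_plan::ExecutionPlan;

fn wrap_stage(
    job_id: &str,
    stage_id: usize,
    child: Arc<dyn ExecutionPlan>,
) -> Result<Arc<dyn ExecutionPlan>> {
    // The stage delegates schema, partitioning, and execution to its child.
    Ok(Arc::new(QueryStageExec::try_new(job_id.to_owned(), stage_id, child)?))
}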
diff --git a/rust/ballista/rust/core/src/execution_plans/shuffle_reader.rs b/rust/ballista/rust/core/src/execution_plans/shuffle_reader.rs
deleted file mode 100644
index bd8f6fd..0000000
--- a/rust/ballista/rust/core/src/execution_plans/shuffle_reader.rs
+++ /dev/null
@@ -1,106 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::sync::Arc;
-use std::{any::Any, pin::Pin};
-
-use crate::client::BallistaClient;
-use crate::memory_stream::MemoryStream;
-use crate::serde::scheduler::PartitionLocation;
-
-use arrow::datatypes::SchemaRef;
-use async_trait::async_trait;
-use datafusion::physical_plan::{ExecutionPlan, Partitioning};
-use datafusion::{
- error::{DataFusionError, Result},
- physical_plan::RecordBatchStream,
-};
-use log::info;
-
-/// ShuffleReaderExec reads partitions that have already been materialized by an executor.
-#[derive(Debug, Clone)]
-pub struct ShuffleReaderExec {
- // The locations of the shuffle partitions, produced by an earlier query stage,
- // that this operator will read
- pub(crate) partition_location: Vec<PartitionLocation>,
- pub(crate) schema: SchemaRef,
-}
-
-impl ShuffleReaderExec {
- /// Create a new ShuffleReaderExec
- pub fn try_new(
- partition_meta: Vec<PartitionLocation>,
- schema: SchemaRef,
- ) -> Result<Self> {
- Ok(Self {
- partition_location: partition_meta,
- schema,
- })
- }
-}
-
-#[async_trait]
-impl ExecutionPlan for ShuffleReaderExec {
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- fn schema(&self) -> SchemaRef {
- self.schema.clone()
- }
-
- fn output_partitioning(&self) -> Partitioning {
- Partitioning::UnknownPartitioning(self.partition_location.len())
- }
-
- fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> {
- vec![]
- }
-
- fn with_new_children(
- &self,
- _children: Vec<Arc<dyn ExecutionPlan>>,
- ) -> Result<Arc<dyn ExecutionPlan>> {
- Err(DataFusionError::Plan(
- "Ballista ShuffleReaderExec does not support with_new_children()".to_owned(),
- ))
- }
-
- async fn execute(
- &self,
- partition: usize,
- ) -> Result<Pin<Box<dyn RecordBatchStream + Send + Sync>>> {
- info!("ShuffleReaderExec::execute({})", partition);
- let partition_location = &self.partition_location[partition];
-
- let mut client = BallistaClient::try_new(
- &partition_location.executor_meta.host,
- partition_location.executor_meta.port,
- )
- .await
- .map_err(|e| DataFusionError::Execution(format!("Ballista Error: {:?}", e)))?;
-
- client
- .fetch_partition(
- &partition_location.partition_id.job_id,
- partition_location.partition_id.stage_id,
- partition,
- )
- .await
- .map_err(|e| DataFusionError::Execution(format!("Ballista Error: {:?}", e)))
- }
-}
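Once the producing stages finish, the scheduler builds a reader over their concrete output locations; a sketch, assuming the location list comes from completed-stage metadata (crate path assumed):

use std::sync::Arc;

use arrow::datatypes::SchemaRef;
use ballista_core::execution_plans::ShuffleReaderExec;
use ballista_core::serde::scheduler::PartitionLocation;
use datafusion::error::Result;
use datafusion::physical_plan::ExecutionPlan;

fn resolve_shuffle(
    locations: Vec<PartitionLocation>,
    schema: SchemaRef,
) -> Result<Arc<dyn ExecutionPlan>> {
    // One input partition per known shuffle partition location.
    Ok(Arc::new(ShuffleReaderExec::try_new(locations, schema)?))
}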
diff --git a/rust/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs b/rust/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs
deleted file mode 100644
index a62a251..0000000
--- a/rust/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs
+++ /dev/null
@@ -1,101 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::sync::Arc;
-use std::{any::Any, pin::Pin};
-
-use crate::client::BallistaClient;
-use crate::memory_stream::MemoryStream;
-use crate::serde::scheduler::PartitionLocation;
-
-use arrow::datatypes::SchemaRef;
-use async_trait::async_trait;
-use datafusion::physical_plan::{ExecutionPlan, Partitioning};
-use datafusion::{
- error::{DataFusionError, Result},
- physical_plan::RecordBatchStream,
-};
-use log::info;
-
-/// UnresolvedShuffleExec represents a dependency on the results of several QueryStageExec nodes which haven't been computed yet.
-///
-/// An ExecutionPlan that contains an UnresolvedShuffleExec isn't ready for execution. The presence of this ExecutionPlan
-/// is used as a signal so the scheduler knows it can't start computation on a specific QueryStageExec.
-#[derive(Debug, Clone)]
-pub struct UnresolvedShuffleExec {
- // The query stage ids which need to be computed
- pub query_stage_ids: Vec<usize>,
-
- // The schema this node will have once it is replaced with a ShuffleReaderExec
- pub schema: SchemaRef,
-
- // The partition count this node will have once it is replaced with a ShuffleReaderExec
- pub partition_count: usize,
-}
-
-impl UnresolvedShuffleExec {
- /// Create a new UnresolvedShuffleExec
- pub fn new(
- query_stage_ids: Vec<usize>,
- schema: SchemaRef,
- partition_count: usize,
- ) -> Self {
- Self {
- query_stage_ids,
- schema,
- partition_count,
- }
- }
-}
-
-#[async_trait]
-impl ExecutionPlan for UnresolvedShuffleExec {
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- fn schema(&self) -> SchemaRef {
- self.schema.clone()
- }
-
- fn output_partitioning(&self) -> Partitioning {
- Partitioning::UnknownPartitioning(self.partition_count)
- }
-
- fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> {
- vec![]
- }
-
- fn with_new_children(
- &self,
- _children: Vec<Arc<dyn ExecutionPlan>>,
- ) -> Result<Arc<dyn ExecutionPlan>> {
- Err(DataFusionError::Plan(
- "Ballista UnresolvedShuffleExec does not support with_new_children()"
- .to_owned(),
- ))
- }
-
- async fn execute(
- &self,
- _partition: usize,
- ) -> Result<Pin<Box<dyn RecordBatchStream + Send + Sync>>> {
- Err(DataFusionError::Plan(
- "Ballista UnresolvedShuffleExec does not support execution".to_owned(),
- ))
- }
-}
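A sketch of how a planner would insert the placeholder while splitting a plan into stages; the stage ids and partition count are illustrative (crate path assumed):

use std::sync::Arc;

use arrow::datatypes::SchemaRef;
use ballista_core::execution_plans::UnresolvedShuffleExec;

fn shuffle_placeholder(
    query_stage_ids: Vec<usize>,
    schema: SchemaRef,
    partition_count: usize,
) -> Arc<UnresolvedShuffleExec> {
    // Not executable: its presence signals that this part of the plan still has
    // unmet stage dependencies.
    Arc::new(UnresolvedShuffleExec::new(query_stage_ids, schema, partition_count))
}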
diff --git a/rust/ballista/rust/core/src/lib.rs b/rust/ballista/rust/core/src/lib.rs
deleted file mode 100644
index 425dbab..0000000
--- a/rust/ballista/rust/core/src/lib.rs
+++ /dev/null
@@ -1,34 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Ballista Distributed Compute
-#![allow(unused_imports)]
-pub const BALLISTA_VERSION: &str = env!("CARGO_PKG_VERSION");
-
-pub fn print_version() {
- println!("Ballista version: {}", BALLISTA_VERSION)
-}
-
-pub mod client;
-pub mod datasource;
-pub mod error;
-pub mod execution_plans;
-pub mod memory_stream;
-pub mod utils;
-
-#[macro_use]
-pub mod serde;
diff --git a/rust/ballista/rust/core/src/memory_stream.rs b/rust/ballista/rust/core/src/memory_stream.rs
deleted file mode 100644
index 8bf5e20..0000000
--- a/rust/ballista/rust/core/src/memory_stream.rs
+++ /dev/null
@@ -1,93 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! This is copied from DataFusion because it is declared as `pub(crate)`. See
-//! https://issues.apache.org/jira/browse/ARROW-11276.
-
-use std::task::{Context, Poll};
-
-use arrow::{datatypes::SchemaRef, error::Result, record_batch::RecordBatch};
-use datafusion::physical_plan::RecordBatchStream;
-use futures::Stream;
-
-/// Iterator over batches
-pub struct MemoryStream {
- /// Vector of record batches
- data: Vec<RecordBatch>,
- /// Schema representing the data
- schema: SchemaRef,
- /// Optional projection for which columns to load
- projection: Option<Vec<usize>>,
- /// Index into the data
- index: usize,
-}
-
-impl MemoryStream {
- /// Create an iterator for a vector of record batches
- pub fn try_new(
- data: Vec<RecordBatch>,
- schema: SchemaRef,
- projection: Option<Vec<usize>>,
- ) -> Result<Self> {
- Ok(Self {
- data,
- schema,
- projection,
- index: 0,
- })
- }
-}
-
-impl Stream for MemoryStream {
- type Item = Result<RecordBatch>;
-
- fn poll_next(
- mut self: std::pin::Pin<&mut Self>,
- _: &mut Context<'_>,
- ) -> Poll<Option<Self::Item>> {
- Poll::Ready(if self.index < self.data.len() {
- self.index += 1;
-
- let batch = &self.data[self.index - 1];
-
- // apply projection
- match &self.projection {
- Some(columns) => Some(RecordBatch::try_new(
- self.schema.clone(),
- columns.iter().map(|i| batch.column(*i).clone()).collect(),
- )),
- None => Some(Ok(batch.clone())),
- }
- } else {
- None
- })
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- (self.data.len(), Some(self.data.len()))
- }
-}
-
-impl RecordBatchStream for MemoryStream {
- /// Get the schema
- fn schema(&self) -> SchemaRef {
- self.schema.clone()
- }
-}
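A sketch of wrapping in-memory batches as a stream, using a single Int32 column for illustration (crate path assumed):

use std::sync::Arc;

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use ballista_core::memory_stream::MemoryStream;

fn make_stream() -> arrow::error::Result<MemoryStream> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )?;
    // No projection: yield each batch unmodified, in order.
    MemoryStream::try_new(vec![batch], schema, None)
}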
diff --git a/rust/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/rust/ballista/rust/core/src/serde/logical_plan/from_proto.rs
deleted file mode 100644
index 9308426..0000000
--- a/rust/ballista/rust/core/src/serde/logical_plan/from_proto.rs
+++ /dev/null
@@ -1,1200 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Serde code to convert from protocol buffers to Rust data structures.
-
-use std::{
- convert::{From, TryInto},
- unimplemented,
-};
-
-use crate::error::BallistaError;
-use crate::serde::{proto_error, protobuf};
-use crate::{convert_box_required, convert_required};
-
-use arrow::datatypes::{DataType, Field, Schema};
-use datafusion::logical_plan::{
- abs, acos, asin, atan, ceil, cos, exp, floor, log10, log2, round, signum, sin, sqrt,
- tan, trunc, Expr, JoinType, LogicalPlan, LogicalPlanBuilder, Operator,
-};
-use datafusion::physical_plan::aggregates::AggregateFunction;
-use datafusion::physical_plan::csv::CsvReadOptions;
-use datafusion::scalar::ScalarValue;
-use protobuf::logical_plan_node::LogicalPlanType;
-use protobuf::{logical_expr_node::ExprType, scalar_type};
-
-// use uuid::Uuid;
-
-impl TryInto<LogicalPlan> for &protobuf::LogicalPlanNode {
- type Error = BallistaError;
-
- fn try_into(self) -> Result<LogicalPlan, Self::Error> {
- let plan = self.logical_plan_type.as_ref().ok_or_else(|| {
- proto_error(format!(
- "logical_plan::from_proto() Unsupported logical plan '{:?}'",
- self
- ))
- })?;
- match plan {
- LogicalPlanType::Projection(projection) => {
- let input: LogicalPlan = convert_box_required!(projection.input)?;
- let x: Vec<Expr> = projection
- .expr
- .iter()
- .map(|expr| expr.try_into())
- .collect::<Result<Vec<_>, _>>()?;
- LogicalPlanBuilder::from(&input)
- .project(x)?
- .build()
- .map_err(|e| e.into())
- }
- LogicalPlanType::Selection(selection) => {
- let input: LogicalPlan = convert_box_required!(selection.input)?;
- LogicalPlanBuilder::from(&input)
- .filter(
- selection
- .expr
- .as_ref()
- .expect("expression required")
- .try_into()?,
- )?
- .build()
- .map_err(|e| e.into())
- }
- LogicalPlanType::Aggregate(aggregate) => {
- let input: LogicalPlan = convert_box_required!(aggregate.input)?;
- let group_expr = aggregate
- .group_expr
- .iter()
- .map(|expr| expr.try_into())
- .collect::<Result<Vec<_>, _>>()?;
- let aggr_expr = aggregate
- .aggr_expr
- .iter()
- .map(|expr| expr.try_into())
- .collect::<Result<Vec<_>, _>>()?;
- LogicalPlanBuilder::from(&input)
- .aggregate(group_expr, aggr_expr)?
- .build()
- .map_err(|e| e.into())
- }
- LogicalPlanType::CsvScan(scan) => {
- let schema: Schema = convert_required!(scan.schema)?;
- let options = CsvReadOptions::new()
- .schema(&schema)
- .delimiter(scan.delimiter.as_bytes()[0])
- .file_extension(&scan.file_extension)
- .has_header(scan.has_header);
-
- let mut projection = None;
- if let Some(column_names) = &scan.projection {
- let column_indices = column_names
- .columns
- .iter()
- .map(|name| schema.index_of(name))
- .collect::<Result<Vec<usize>, _>>()?;
- projection = Some(column_indices);
- }
-
- LogicalPlanBuilder::scan_csv(&scan.path, options, projection)?
- .build()
- .map_err(|e| e.into())
- }
- LogicalPlanType::ParquetScan(scan) => {
- let projection = match scan.projection.as_ref() {
- None => None,
- Some(columns) => {
- let schema: Schema = convert_required!(scan.schema)?;
- let r: Result<Vec<usize>, _> = columns
- .columns
- .iter()
- .map(|col_name| {
- schema.fields().iter().position(|field| field.name() == col_name).ok_or_else(|| {
- let column_names: Vec<&String> = schema.fields().iter().map(|f| f.name()).collect();
- proto_error(format!(
- "Parquet projection contains column name that is not present in schema. Column name: {}. Schema columns: {:?}",
- col_name, column_names
- ))
- })
- })
- .collect();
- Some(r?)
- }
- };
- LogicalPlanBuilder::scan_parquet(&scan.path, projection, 24)? //TODO concurrency
- .build()
- .map_err(|e| e.into())
- }
- LogicalPlanType::Sort(sort) => {
- let input: LogicalPlan = convert_box_required!(sort.input)?;
- let sort_expr: Vec<Expr> = sort
- .expr
- .iter()
- .map(|expr| expr.try_into())
- .collect::<Result<Vec<Expr>, _>>()?;
- LogicalPlanBuilder::from(&input)
- .sort(sort_expr)?
- .build()
- .map_err(|e| e.into())
- }
- LogicalPlanType::Repartition(repartition) => {
- use datafusion::logical_plan::Partitioning;
- let input: LogicalPlan = convert_box_required!(repartition.input)?;
- use protobuf::repartition_node::PartitionMethod;
- let pb_partition_method = repartition.partition_method.clone().ok_or_else(|| {
- BallistaError::General(String::from(
- "Protobuf deserialization error, RepartitionNode was missing required field 'partition_method'",
- ))
- })?;
-
- let partitioning_scheme = match pb_partition_method {
- PartitionMethod::Hash(protobuf::HashRepartition {
- hash_expr: pb_hash_expr,
- partition_count,
- }) => Partitioning::Hash(
- pb_hash_expr
- .iter()
- .map(|pb_expr| pb_expr.try_into())
- .collect::<Result<Vec<_>, _>>()?,
- partition_count as usize,
- ),
- PartitionMethod::RoundRobin(batch_size) => {
- Partitioning::RoundRobinBatch(batch_size as usize)
- }
- };
-
- LogicalPlanBuilder::from(&input)
- .repartition(partitioning_scheme)?
- .build()
- .map_err(|e| e.into())
- }
- LogicalPlanType::EmptyRelation(empty_relation) => {
- LogicalPlanBuilder::empty(empty_relation.produce_one_row)
- .build()
- .map_err(|e| e.into())
- }
- LogicalPlanType::CreateExternalTable(create_extern_table) => {
- let pb_schema = (create_extern_table.schema.clone()).ok_or_else(|| {
- BallistaError::General(String::from(
- "Protobuf deserialization error, CreateExternalTableNode was missing required field schema.",
- ))
- })?;
-
- let pb_file_type: protobuf::FileType =
- create_extern_table.file_type.try_into()?;
-
- Ok(LogicalPlan::CreateExternalTable {
- schema: pb_schema.try_into()?,
- name: create_extern_table.name.clone(),
- location: create_extern_table.location.clone(),
- file_type: pb_file_type.into(),
- has_header: create_extern_table.has_header,
- })
- }
- LogicalPlanType::Explain(explain) => {
- let input: LogicalPlan = convert_box_required!(explain.input)?;
- LogicalPlanBuilder::from(&input)
- .explain(explain.verbose)?
- .build()
- .map_err(|e| e.into())
- }
- LogicalPlanType::Limit(limit) => {
- let input: LogicalPlan = convert_box_required!(limit.input)?;
- LogicalPlanBuilder::from(&input)
- .limit(limit.limit as usize)?
- .build()
- .map_err(|e| e.into())
- }
- LogicalPlanType::Join(join) => {
- let left_keys: Vec<&str> =
- join.left_join_column.iter().map(|i| i.as_str()).collect();
- let right_keys: Vec<&str> =
- join.right_join_column.iter().map(|i| i.as_str()).collect();
- let join_type =
- protobuf::JoinType::from_i32(join.join_type).ok_or_else(|| {
- proto_error(format!(
- "Received a JoinNode message with unknown JoinType {}",
- join.join_type
- ))
- })?;
- let join_type = match join_type {
- protobuf::JoinType::Inner => JoinType::Inner,
- protobuf::JoinType::Left => JoinType::Left,
- protobuf::JoinType::Right => JoinType::Right,
- };
- LogicalPlanBuilder::from(&convert_box_required!(join.left)?)
- .join(
- &convert_box_required!(join.right)?,
- join_type,
- &left_keys,
- &right_keys,
- )?
- .build()
- .map_err(|e| e.into())
- }
- }
- }
-}
-
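Decoding follows the usual prost-then-TryInto path; a sketch, assuming `buf` holds an encoded LogicalPlanNode and using the module-local `protobuf` types:

use std::convert::TryInto;

use prost::Message;

fn decode_plan(buf: &[u8]) -> Result<datafusion::logical_plan::LogicalPlan, BallistaError> {
    // Decode the protobuf bytes, then convert via the TryInto impl above.
    let node = protobuf::LogicalPlanNode::decode(buf)
        .map_err(|e| BallistaError::General(format!("{:?}", e)))?;
    (&node).try_into()
}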
-impl TryInto<datafusion::logical_plan::DFSchema> for protobuf::Schema {
- type Error = BallistaError;
- fn try_into(self) -> Result<datafusion::logical_plan::DFSchema, Self::Error> {
- let schema: Schema = (&self).try_into()?;
- schema.try_into().map_err(BallistaError::DataFusionError)
- }
-}
-
-impl TryInto<datafusion::logical_plan::DFSchemaRef> for protobuf::Schema {
- type Error = BallistaError;
- fn try_into(self) -> Result<datafusion::logical_plan::DFSchemaRef, Self::Error> {
- use datafusion::logical_plan::ToDFSchema;
- let schema: Schema = (&self).try_into()?;
- schema
- .to_dfschema_ref()
- .map_err(BallistaError::DataFusionError)
- }
-}
-
-impl TryInto<arrow::datatypes::DataType> for &protobuf::scalar_type::Datatype {
- type Error = BallistaError;
- fn try_into(self) -> Result<arrow::datatypes::DataType, Self::Error> {
- use protobuf::scalar_type::Datatype;
- Ok(match self {
- Datatype::Scalar(scalar_type) => {
- let pb_scalar_enum = protobuf::PrimitiveScalarType::from_i32(*scalar_type).ok_or_else(|| {
- proto_error(format!(
- "Protobuf deserialization error, scalar_type::Datatype missing was provided invalid enum variant: {}",
- *scalar_type
- ))
- })?;
- pb_scalar_enum.into()
- }
- Datatype::List(protobuf::ScalarListType {
- deepest_type,
- field_names,
- }) => {
- if field_names.is_empty() {
- return Err(proto_error(
- "Protobuf deserialization error: found no field names in ScalarListType message which requires at least one",
- ));
- }
- let pb_scalar_type = protobuf::PrimitiveScalarType::from_i32(
- *deepest_type,
- )
- .ok_or_else(|| {
- proto_error(format!(
- "Protobuf deserialization error: invalid i32 for scalar enum: {}",
- *deepest_type
- ))
- })?;
- // Because the length is checked above, it is safe to unwrap .last()
- let mut scalar_type =
- arrow::datatypes::DataType::List(Box::new(Field::new(
- field_names.last().unwrap().as_str(),
- pb_scalar_type.into(),
- true,
- )));
- // Iterate over field names in reverse order, except for the last item in the vector
- for name in field_names.iter().rev().skip(1) {
- let new_datatype = arrow::datatypes::DataType::List(Box::new(
- Field::new(name.as_str(), scalar_type, true),
- ));
- scalar_type = new_datatype;
- }
- scalar_type
- }
- })
- }
-}
-
-impl TryInto<arrow::datatypes::DataType> for &protobuf::arrow_type::ArrowTypeEnum {
- type Error = BallistaError;
- fn try_into(self) -> Result<arrow::datatypes::DataType, Self::Error> {
- use arrow::datatypes::DataType;
- use protobuf::arrow_type;
- Ok(match self {
- arrow_type::ArrowTypeEnum::None(_) => DataType::Null,
- arrow_type::ArrowTypeEnum::Bool(_) => DataType::Boolean,
- arrow_type::ArrowTypeEnum::Uint8(_) => DataType::UInt8,
- arrow_type::ArrowTypeEnum::Int8(_) => DataType::Int8,
- arrow_type::ArrowTypeEnum::Uint16(_) => DataType::UInt16,
- arrow_type::ArrowTypeEnum::Int16(_) => DataType::Int16,
- arrow_type::ArrowTypeEnum::Uint32(_) => DataType::UInt32,
- arrow_type::ArrowTypeEnum::Int32(_) => DataType::Int32,
- arrow_type::ArrowTypeEnum::Uint64(_) => DataType::UInt64,
- arrow_type::ArrowTypeEnum::Int64(_) => DataType::Int64,
- arrow_type::ArrowTypeEnum::Float16(_) => DataType::Float16,
- arrow_type::ArrowTypeEnum::Float32(_) => DataType::Float32,
- arrow_type::ArrowTypeEnum::Float64(_) => DataType::Float64,
- arrow_type::ArrowTypeEnum::Utf8(_) => DataType::Utf8,
- arrow_type::ArrowTypeEnum::LargeUtf8(_) => DataType::LargeUtf8,
- arrow_type::ArrowTypeEnum::Binary(_) => DataType::Binary,
- arrow_type::ArrowTypeEnum::FixedSizeBinary(size) => {
- DataType::FixedSizeBinary(*size)
- }
- arrow_type::ArrowTypeEnum::LargeBinary(_) => DataType::LargeBinary,
- arrow_type::ArrowTypeEnum::Date32(_) => DataType::Date32,
- arrow_type::ArrowTypeEnum::Date64(_) => DataType::Date64,
- arrow_type::ArrowTypeEnum::Duration(time_unit) => {
- DataType::Duration(protobuf::TimeUnit::from_i32_to_arrow(*time_unit)?)
- }
- arrow_type::ArrowTypeEnum::Timestamp(protobuf::Timestamp {
- time_unit,
- timezone,
- }) => DataType::Timestamp(
- protobuf::TimeUnit::from_i32_to_arrow(*time_unit)?,
- match timezone.len() {
- 0 => None,
- _ => Some(timezone.to_owned()),
- },
- ),
- arrow_type::ArrowTypeEnum::Time32(time_unit) => {
- DataType::Time32(protobuf::TimeUnit::from_i32_to_arrow(*time_unit)?)
- }
- arrow_type::ArrowTypeEnum::Time64(time_unit) => {
- DataType::Time64(protobuf::TimeUnit::from_i32_to_arrow(*time_unit)?)
- }
- arrow_type::ArrowTypeEnum::Interval(interval_unit) => DataType::Interval(
- protobuf::IntervalUnit::from_i32_to_arrow(*interval_unit)?,
- ),
- arrow_type::ArrowTypeEnum::Decimal(protobuf::Decimal {
- whole,
- fractional,
- }) => DataType::Decimal(*whole as usize, *fractional as usize),
- arrow_type::ArrowTypeEnum::List(list) => {
- let list_type: &protobuf::Field = list
- .as_ref()
- .field_type
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: List message missing required field 'field_type'"))?
- .as_ref();
- DataType::List(Box::new(list_type.try_into()?))
- }
- arrow_type::ArrowTypeEnum::LargeList(list) => {
- let list_type: &protobuf::Field = list
- .as_ref()
- .field_type
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: List message missing required field 'field_type'"))?
- .as_ref();
- DataType::LargeList(Box::new(list_type.try_into()?))
- }
- arrow_type::ArrowTypeEnum::FixedSizeList(list) => {
- let list_type: &protobuf::Field = list
- .as_ref()
- .field_type
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: List message missing required field 'field_type'"))?
- .as_ref();
- let list_size = list.list_size;
- DataType::FixedSizeList(Box::new(list_type.try_into()?), list_size)
- }
- arrow_type::ArrowTypeEnum::Struct(strct) => DataType::Struct(
- strct
- .sub_field_types
- .iter()
- .map(|field| field.try_into())
- .collect::<Result<Vec<_>, _>>()?,
- ),
- arrow_type::ArrowTypeEnum::Union(union) => DataType::Union(
- union
- .union_types
- .iter()
- .map(|field| field.try_into())
- .collect::<Result<Vec<_>, _>>()?,
- ),
- arrow_type::ArrowTypeEnum::Dictionary(dict) => {
- let pb_key_datatype = dict
- .as_ref()
- .key
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: Dictionary message missing required field 'key'"))?;
- let pb_value_datatype = dict
- .as_ref()
- .value
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: Dictionary message missing required field 'key'"))?;
- let key_datatype: DataType = pb_key_datatype.as_ref().try_into()?;
- let value_datatype: DataType = pb_value_datatype.as_ref().try_into()?;
- DataType::Dictionary(Box::new(key_datatype), Box::new(value_datatype))
- }
- })
- }
-}
-
-impl Into<arrow::datatypes::DataType> for protobuf::PrimitiveScalarType {
- fn into(self) -> arrow::datatypes::DataType {
- use arrow::datatypes::DataType;
- match self {
- protobuf::PrimitiveScalarType::Bool => DataType::Boolean,
- protobuf::PrimitiveScalarType::Uint8 => DataType::UInt8,
- protobuf::PrimitiveScalarType::Int8 => DataType::Int8,
- protobuf::PrimitiveScalarType::Uint16 => DataType::UInt16,
- protobuf::PrimitiveScalarType::Int16 => DataType::Int16,
- protobuf::PrimitiveScalarType::Uint32 => DataType::UInt32,
- protobuf::PrimitiveScalarType::Int32 => DataType::Int32,
- protobuf::PrimitiveScalarType::Uint64 => DataType::UInt64,
- protobuf::PrimitiveScalarType::Int64 => DataType::Int64,
- protobuf::PrimitiveScalarType::Float32 => DataType::Float32,
- protobuf::PrimitiveScalarType::Float64 => DataType::Float64,
- protobuf::PrimitiveScalarType::Utf8 => DataType::Utf8,
- protobuf::PrimitiveScalarType::LargeUtf8 => DataType::LargeUtf8,
- protobuf::PrimitiveScalarType::Date32 => DataType::Date32,
- protobuf::PrimitiveScalarType::TimeMicrosecond => {
- DataType::Time64(arrow::datatypes::TimeUnit::Microsecond)
- }
- protobuf::PrimitiveScalarType::TimeNanosecond => {
- DataType::Time64(arrow::datatypes::TimeUnit::Nanosecond)
- }
- protobuf::PrimitiveScalarType::Null => DataType::Null,
- }
- }
-}
-
-// Does not typecheck lists
-fn typechecked_scalar_value_conversion(
- tested_type: &protobuf::scalar_value::Value,
- required_type: protobuf::PrimitiveScalarType,
-) -> Result<datafusion::scalar::ScalarValue, BallistaError> {
- use protobuf::scalar_value::Value;
- use protobuf::PrimitiveScalarType;
- Ok(match (tested_type, &required_type) {
- (Value::BoolValue(v), PrimitiveScalarType::Bool) => {
- ScalarValue::Boolean(Some(*v))
- }
- (Value::Int8Value(v), PrimitiveScalarType::Int8) => {
- ScalarValue::Int8(Some(*v as i8))
- }
- (Value::Int16Value(v), PrimitiveScalarType::Int16) => {
- ScalarValue::Int16(Some(*v as i16))
- }
- (Value::Int32Value(v), PrimitiveScalarType::Int32) => {
- ScalarValue::Int32(Some(*v))
- }
- (Value::Int64Value(v), PrimitiveScalarType::Int64) => {
- ScalarValue::Int64(Some(*v))
- }
- (Value::Uint8Value(v), PrimitiveScalarType::Uint8) => {
- ScalarValue::UInt8(Some(*v as u8))
- }
- (Value::Uint16Value(v), PrimitiveScalarType::Uint16) => {
- ScalarValue::UInt16(Some(*v as u16))
- }
- (Value::Uint32Value(v), PrimitiveScalarType::Uint32) => {
- ScalarValue::UInt32(Some(*v))
- }
- (Value::Uint64Value(v), PrimitiveScalarType::Uint64) => {
- ScalarValue::UInt64(Some(*v))
- }
- (Value::Float32Value(v), PrimitiveScalarType::Float32) => {
- ScalarValue::Float32(Some(*v))
- }
- (Value::Float64Value(v), PrimitiveScalarType::Float64) => {
- ScalarValue::Float64(Some(*v))
- }
- (Value::Date32Value(v), PrimitiveScalarType::Date32) => {
- ScalarValue::Date32(Some(*v))
- }
- (Value::TimeMicrosecondValue(v), PrimitiveScalarType::TimeMicrosecond) => {
- ScalarValue::TimestampMicrosecond(Some(*v))
- }
- (Value::TimeNanosecondValue(v), PrimitiveScalarType::TimeNanosecond) => {
- ScalarValue::TimestampNanosecond(Some(*v))
- }
- (Value::Utf8Value(v), PrimitiveScalarType::Utf8) => {
- ScalarValue::Utf8(Some(v.to_owned()))
- }
- (Value::LargeUtf8Value(v), PrimitiveScalarType::LargeUtf8) => {
- ScalarValue::LargeUtf8(Some(v.to_owned()))
- }
-
- (Value::NullValue(i32_enum), required_scalar_type) => {
- if *i32_enum == *required_scalar_type as i32 {
- let pb_scalar_type = PrimitiveScalarType::from_i32(*i32_enum).ok_or_else(|| {
- BallistaError::General(format!(
- "Invalid i32_enum={} when converting with PrimitiveScalarType::from_i32()",
- *i32_enum
- ))
- })?;
- let scalar_value: ScalarValue = match pb_scalar_type {
- PrimitiveScalarType::Bool => ScalarValue::Boolean(None),
- PrimitiveScalarType::Uint8 => ScalarValue::UInt8(None),
- PrimitiveScalarType::Int8 => ScalarValue::Int8(None),
- PrimitiveScalarType::Uint16 => ScalarValue::UInt16(None),
- PrimitiveScalarType::Int16 => ScalarValue::Int16(None),
- PrimitiveScalarType::Uint32 => ScalarValue::UInt32(None),
- PrimitiveScalarType::Int32 => ScalarValue::Int32(None),
- PrimitiveScalarType::Uint64 => ScalarValue::UInt64(None),
- PrimitiveScalarType::Int64 => ScalarValue::Int64(None),
- PrimitiveScalarType::Float32 => ScalarValue::Float32(None),
- PrimitiveScalarType::Float64 => ScalarValue::Float64(None),
- PrimitiveScalarType::Utf8 => ScalarValue::Utf8(None),
- PrimitiveScalarType::LargeUtf8 => ScalarValue::LargeUtf8(None),
- PrimitiveScalarType::Date32 => ScalarValue::Date32(None),
- PrimitiveScalarType::TimeMicrosecond => {
- ScalarValue::TimestampMicrosecond(None)
- }
- PrimitiveScalarType::TimeNanosecond => {
- ScalarValue::TimestampNanosecond(None)
- }
- PrimitiveScalarType::Null => {
- return Err(proto_error(
- "Untyped scalar null is not a valid scalar value",
- ))
- }
- };
- scalar_value
- } else {
- return Err(proto_error("Could not convert to the proper type"));
- }
- }
- _ => return Err(proto_error("Could not convert to the proper type")),
- })
-}
-
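Within this module, the helper guards against a payload arriving under the wrong declared type; a sketch of a call site with illustrative values:

fn typecheck_example() -> Result<(), BallistaError> {
    let value = protobuf::scalar_value::Value::Int64Value(42);
    // Succeeds because the payload variant matches the required type.
    let scalar =
        typechecked_scalar_value_conversion(&value, protobuf::PrimitiveScalarType::Int64)?;
    assert_eq!(scalar, ScalarValue::Int64(Some(42)));
    // A mismatched required type is rejected.
    assert!(
        typechecked_scalar_value_conversion(&value, protobuf::PrimitiveScalarType::Utf8).is_err()
    );
    Ok(())
}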
-impl TryInto<datafusion::scalar::ScalarValue> for &protobuf::scalar_value::Value {
- type Error = BallistaError;
- fn try_into(self) -> Result<datafusion::scalar::ScalarValue, Self::Error> {
- use datafusion::scalar::ScalarValue;
- use protobuf::PrimitiveScalarType;
- let scalar = match self {
- protobuf::scalar_value::Value::BoolValue(v) => ScalarValue::Boolean(Some(*v)),
- protobuf::scalar_value::Value::Utf8Value(v) => {
- ScalarValue::Utf8(Some(v.to_owned()))
- }
- protobuf::scalar_value::Value::LargeUtf8Value(v) => {
- ScalarValue::LargeUtf8(Some(v.to_owned()))
- }
- protobuf::scalar_value::Value::Int8Value(v) => {
- ScalarValue::Int8(Some(*v as i8))
- }
- protobuf::scalar_value::Value::Int16Value(v) => {
- ScalarValue::Int16(Some(*v as i16))
- }
- protobuf::scalar_value::Value::Int32Value(v) => ScalarValue::Int32(Some(*v)),
- protobuf::scalar_value::Value::Int64Value(v) => ScalarValue::Int64(Some(*v)),
- protobuf::scalar_value::Value::Uint8Value(v) => {
- ScalarValue::UInt8(Some(*v as u8))
- }
- protobuf::scalar_value::Value::Uint16Value(v) => {
- ScalarValue::UInt16(Some(*v as u16))
- }
- protobuf::scalar_value::Value::Uint32Value(v) => {
- ScalarValue::UInt32(Some(*v))
- }
- protobuf::scalar_value::Value::Uint64Value(v) => {
- ScalarValue::UInt64(Some(*v))
- }
- protobuf::scalar_value::Value::Float32Value(v) => {
- ScalarValue::Float32(Some(*v))
- }
- protobuf::scalar_value::Value::Float64Value(v) => {
- ScalarValue::Float64(Some(*v))
- }
- protobuf::scalar_value::Value::Date32Value(v) => {
- ScalarValue::Date32(Some(*v))
- }
- protobuf::scalar_value::Value::TimeMicrosecondValue(v) => {
- ScalarValue::TimestampMicrosecond(Some(*v))
- }
- protobuf::scalar_value::Value::TimeNanosecondValue(v) => {
- ScalarValue::TimestampNanosecond(Some(*v))
- }
- protobuf::scalar_value::Value::ListValue(v) => v.try_into()?,
- protobuf::scalar_value::Value::NullListValue(v) => {
- ScalarValue::List(None, v.try_into()?)
- }
- protobuf::scalar_value::Value::NullValue(null_enum) => {
- PrimitiveScalarType::from_i32(*null_enum)
- .ok_or_else(|| proto_error("Invalid scalar type"))?
- .try_into()?
- }
- };
- Ok(scalar)
- }
-}
-
-impl TryInto<datafusion::scalar::ScalarValue> for &protobuf::ScalarListValue {
- type Error = BallistaError;
- fn try_into(self) -> Result<datafusion::scalar::ScalarValue, Self::Error> {
- use protobuf::scalar_type::Datatype;
- use protobuf::PrimitiveScalarType;
- let protobuf::ScalarListValue { datatype, values } = self;
- let pb_scalar_type = datatype
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: ScalarListValue messsage missing required field 'datatype'"))?;
- let scalar_type = pb_scalar_type
- .datatype
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: ScalarListValue.Datatype messsage missing required field 'datatype'"))?;
- let scalar_values = match scalar_type {
- Datatype::Scalar(scalar_type_i32) => {
- let leaf_scalar_type =
- protobuf::PrimitiveScalarType::from_i32(*scalar_type_i32)
- .ok_or_else(|| {
- proto_error("Error converting i32 to basic scalar type")
- })?;
- let typechecked_values: Vec<datafusion::scalar::ScalarValue> = values
- .iter()
- .map(|protobuf::ScalarValue { value: opt_value }| {
- let value = opt_value.as_ref().ok_or_else(|| {
- proto_error(
- "Protobuf deserialization error: missing required field 'value'",
- )
- })?;
- typechecked_scalar_value_conversion(value, leaf_scalar_type)
- })
- .collect::<Result<Vec<_>, _>>()?;
- datafusion::scalar::ScalarValue::List(
- Some(typechecked_values),
- leaf_scalar_type.into(),
- )
- }
- Datatype::List(list_type) => {
- let protobuf::ScalarListType {
- deepest_type,
- field_names,
- } = &list_type;
- let leaf_type =
- PrimitiveScalarType::from_i32(*deepest_type).ok_or_else(|| {
- proto_error("Error converting i32 to basic scalar type")
- })?;
- let depth = field_names.len();
-
- let typechecked_values: Vec<datafusion::scalar::ScalarValue> = if depth
- == 0
- {
- return Err(proto_error(
- "Protobuf deserialization error, ScalarListType had no field names, requires at least one",
- ));
- } else if depth == 1 {
- values
- .iter()
- .map(|protobuf::ScalarValue { value: opt_value }| {
- let value = opt_value
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: missing required field 'value'"))?;
- typechecked_scalar_value_conversion(value, leaf_type)
- })
- .collect::<Result<Vec<_>, _>>()?
- } else {
- values
- .iter()
- .map(|protobuf::ScalarValue { value: opt_value }| {
- let value = opt_value
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: missing required field 'value'"))?;
- value.try_into()
- })
- .collect::<Result<Vec<_>, _>>()?
- };
- datafusion::scalar::ScalarValue::List(
- match typechecked_values.len() {
- 0 => None,
- _ => Some(typechecked_values),
- },
- list_type.try_into()?,
- )
- }
- };
- Ok(scalar_values)
- }
-}
-
-impl TryInto<arrow::datatypes::DataType> for &protobuf::ScalarListType {
- type Error = BallistaError;
- fn try_into(self) -> Result<arrow::datatypes::DataType, Self::Error> {
- use protobuf::PrimitiveScalarType;
- let protobuf::ScalarListType {
- deepest_type,
- field_names,
- } = self;
-
- let depth = field_names.len();
- if depth == 0 {
- return Err(proto_error(
- "Protobuf deserialization error: Found a ScalarListType message with no field names, at least one is required",
- ));
- }
-
- let mut curr_type = arrow::datatypes::DataType::List(Box::new(Field::new(
- // Since the vector is checked as non-empty above, this unwrap is safe
- field_names.last().unwrap(),
- PrimitiveScalarType::from_i32(*deepest_type)
- .ok_or_else(|| {
- proto_error("Could not convert to datafusion scalar type")
- })?
- .into(),
- true,
- )));
- // Iterate over field names in reverse order, except for the last item in the vector
- for name in field_names.iter().rev().skip(1) {
- let temp_curr_type = arrow::datatypes::DataType::List(Box::new(Field::new(
- name, curr_type, true,
- )));
- curr_type = temp_curr_type;
- }
- Ok(curr_type)
- }
-}
-
-impl TryInto<datafusion::scalar::ScalarValue> for protobuf::PrimitiveScalarType {
- type Error = BallistaError;
- fn try_into(self) -> Result<datafusion::scalar::ScalarValue, Self::Error> {
- use datafusion::scalar::ScalarValue;
- Ok(match self {
- protobuf::PrimitiveScalarType::Null => {
- return Err(proto_error("Untyped null is an invalid scalar value"))
- }
- protobuf::PrimitiveScalarType::Bool => ScalarValue::Boolean(None),
- protobuf::PrimitiveScalarType::Uint8 => ScalarValue::UInt8(None),
- protobuf::PrimitiveScalarType::Int8 => ScalarValue::Int8(None),
- protobuf::PrimitiveScalarType::Uint16 => ScalarValue::UInt16(None),
- protobuf::PrimitiveScalarType::Int16 => ScalarValue::Int16(None),
- protobuf::PrimitiveScalarType::Uint32 => ScalarValue::UInt32(None),
- protobuf::PrimitiveScalarType::Int32 => ScalarValue::Int32(None),
- protobuf::PrimitiveScalarType::Uint64 => ScalarValue::UInt64(None),
- protobuf::PrimitiveScalarType::Int64 => ScalarValue::Int64(None),
- protobuf::PrimitiveScalarType::Float32 => ScalarValue::Float32(None),
- protobuf::PrimitiveScalarType::Float64 => ScalarValue::Float64(None),
- protobuf::PrimitiveScalarType::Utf8 => ScalarValue::Utf8(None),
- protobuf::PrimitiveScalarType::LargeUtf8 => ScalarValue::LargeUtf8(None),
- protobuf::PrimitiveScalarType::Date32 => ScalarValue::Date32(None),
- protobuf::PrimitiveScalarType::TimeMicrosecond => {
- ScalarValue::TimestampMicrosecond(None)
- }
- protobuf::PrimitiveScalarType::TimeNanosecond => {
- ScalarValue::TimestampNanosecond(None)
- }
- })
- }
-}
-
-impl TryInto<datafusion::scalar::ScalarValue> for &protobuf::ScalarValue {
- type Error = BallistaError;
- fn try_into(self) -> Result<datafusion::scalar::ScalarValue, Self::Error> {
- let value = self.value.as_ref().ok_or_else(|| {
- proto_error("Protobuf deserialization error: missing required field 'value'")
- })?;
- Ok(match value {
- protobuf::scalar_value::Value::BoolValue(v) => ScalarValue::Boolean(Some(*v)),
- protobuf::scalar_value::Value::Utf8Value(v) => {
- ScalarValue::Utf8(Some(v.to_owned()))
- }
- protobuf::scalar_value::Value::LargeUtf8Value(v) => {
- ScalarValue::LargeUtf8(Some(v.to_owned()))
- }
- protobuf::scalar_value::Value::Int8Value(v) => {
- ScalarValue::Int8(Some(*v as i8))
- }
- protobuf::scalar_value::Value::Int16Value(v) => {
- ScalarValue::Int16(Some(*v as i16))
- }
- protobuf::scalar_value::Value::Int32Value(v) => ScalarValue::Int32(Some(*v)),
- protobuf::scalar_value::Value::Int64Value(v) => ScalarValue::Int64(Some(*v)),
- protobuf::scalar_value::Value::Uint8Value(v) => {
- ScalarValue::UInt8(Some(*v as u8))
- }
- protobuf::scalar_value::Value::Uint16Value(v) => {
- ScalarValue::UInt16(Some(*v as u16))
- }
- protobuf::scalar_value::Value::Uint32Value(v) => {
- ScalarValue::UInt32(Some(*v))
- }
- protobuf::scalar_value::Value::Uint64Value(v) => {
- ScalarValue::UInt64(Some(*v))
- }
- protobuf::scalar_value::Value::Float32Value(v) => {
- ScalarValue::Float32(Some(*v))
- }
- protobuf::scalar_value::Value::Float64Value(v) => {
- ScalarValue::Float64(Some(*v))
- }
- protobuf::scalar_value::Value::Date32Value(v) => {
- ScalarValue::Date32(Some(*v))
- }
- protobuf::scalar_value::Value::TimeMicrosecondValue(v) => {
- ScalarValue::TimestampMicrosecond(Some(*v))
- }
- protobuf::scalar_value::Value::TimeNanosecondValue(v) => {
- ScalarValue::TimestampNanosecond(Some(*v))
- }
- protobuf::scalar_value::Value::ListValue(scalar_list) => {
- let protobuf::ScalarListValue {
- values,
- datatype: opt_scalar_type,
- } = &scalar_list;
- let pb_scalar_type = opt_scalar_type
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization err: ScalaListValue missing required field 'datatype'"))?;
- let typechecked_values: Vec<ScalarValue> = values
- .iter()
- .map(|val| val.try_into())
- .collect::<Result<Vec<_>, _>>()?;
- let scalar_type: arrow::datatypes::DataType =
- pb_scalar_type.try_into()?;
- ScalarValue::List(Some(typechecked_values), scalar_type)
- }
- protobuf::scalar_value::Value::NullListValue(v) => {
- let pb_datatype = v
- .datatype
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: NullListValue message missing required field 'datatyp'"))?;
- ScalarValue::List(None, pb_datatype.try_into()?)
- }
- protobuf::scalar_value::Value::NullValue(v) => {
- let null_type_enum = protobuf::PrimitiveScalarType::from_i32(*v)
- .ok_or_else(|| proto_error("Protobuf deserialization error found invalid enum variant for DatafusionScalar"))?;
- null_type_enum.try_into()?
- }
- })
- }
-}
-
-impl TryInto<Expr> for &protobuf::LogicalExprNode {
- type Error = BallistaError;
-
- fn try_into(self) -> Result<Expr, Self::Error> {
- use protobuf::logical_expr_node::ExprType;
-
- let expr_type = self
- .expr_type
- .as_ref()
- .ok_or_else(|| proto_error("Unexpected empty logical expression"))?;
- match expr_type {
- ExprType::BinaryExpr(binary_expr) => Ok(Expr::BinaryExpr {
- left: Box::new(parse_required_expr(&binary_expr.l)?),
- op: from_proto_binary_op(&binary_expr.op)?,
- right: Box::new(parse_required_expr(&binary_expr.r)?),
- }),
- ExprType::ColumnName(column_name) => Ok(Expr::Column(column_name.to_owned())),
- ExprType::Literal(literal) => {
- use datafusion::scalar::ScalarValue;
- let scalar_value: datafusion::scalar::ScalarValue = literal.try_into()?;
- Ok(Expr::Literal(scalar_value))
- }
- ExprType::AggregateExpr(expr) => {
- let aggr_function =
- protobuf::AggregateFunction::from_i32(expr.aggr_function)
- .ok_or_else(|| {
- proto_error(format!(
- "Received an unknown aggregate function: {}",
- expr.aggr_function
- ))
- })?;
- let fun = match aggr_function {
- protobuf::AggregateFunction::Min => AggregateFunction::Min,
- protobuf::AggregateFunction::Max => AggregateFunction::Max,
- protobuf::AggregateFunction::Sum => AggregateFunction::Sum,
- protobuf::AggregateFunction::Avg => AggregateFunction::Avg,
- protobuf::AggregateFunction::Count => AggregateFunction::Count,
- };
-
- Ok(Expr::AggregateFunction {
- fun,
- args: vec![parse_required_expr(&expr.expr)?],
- distinct: false, //TODO
- })
- }
- ExprType::Alias(alias) => Ok(Expr::Alias(
- Box::new(parse_required_expr(&alias.expr)?),
- alias.alias.clone(),
- )),
- ExprType::IsNullExpr(is_null) => {
- Ok(Expr::IsNull(Box::new(parse_required_expr(&is_null.expr)?)))
- }
- ExprType::IsNotNullExpr(is_not_null) => Ok(Expr::IsNotNull(Box::new(
- parse_required_expr(&is_not_null.expr)?,
- ))),
- ExprType::NotExpr(not) => {
- Ok(Expr::Not(Box::new(parse_required_expr(&not.expr)?)))
- }
- ExprType::Between(between) => Ok(Expr::Between {
- expr: Box::new(parse_required_expr(&between.expr)?),
- negated: between.negated,
- low: Box::new(parse_required_expr(&between.low)?),
- high: Box::new(parse_required_expr(&between.high)?),
- }),
- ExprType::Case(case) => {
- let when_then_expr = case
- .when_then_expr
- .iter()
- .map(|e| {
- Ok((
- Box::new(match &e.when_expr {
- Some(e) => e.try_into(),
- None => Err(proto_error("Missing required expression")),
- }?),
- Box::new(match &e.then_expr {
- Some(e) => e.try_into(),
- None => Err(proto_error("Missing required expression")),
- }?),
- ))
- })
- .collect::<Result<Vec<(Box<Expr>, Box<Expr>)>, BallistaError>>()?;
- Ok(Expr::Case {
- expr: parse_optional_expr(&case.expr)?.map(Box::new),
- when_then_expr,
- else_expr: parse_optional_expr(&case.else_expr)?.map(Box::new),
- })
- }
- ExprType::Cast(cast) => {
- let expr = Box::new(parse_required_expr(&cast.expr)?);
- let arrow_type: &protobuf::ArrowType = cast
- .arrow_type
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: CastNode message missing required field 'arrow_type'"))?;
- let data_type = arrow_type.try_into()?;
- Ok(Expr::Cast { expr, data_type })
- }
- ExprType::TryCast(cast) => {
- let expr = Box::new(parse_required_expr(&cast.expr)?);
- let arrow_type: &protobuf::ArrowType = cast
- .arrow_type
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: CastNode message missing required field 'arrow_type'"))?;
- let data_type = arrow_type.try_into()?;
- Ok(Expr::TryCast { expr, data_type })
- }
- ExprType::Sort(sort) => Ok(Expr::Sort {
- expr: Box::new(parse_required_expr(&sort.expr)?),
- asc: sort.asc,
- nulls_first: sort.nulls_first,
- }),
- ExprType::Negative(negative) => Ok(Expr::Negative(Box::new(
- parse_required_expr(&negative.expr)?,
- ))),
- ExprType::InList(in_list) => Ok(Expr::InList {
- expr: Box::new(parse_required_expr(&in_list.expr)?),
- list: in_list
- .list
- .iter()
- .map(|expr| expr.try_into())
- .collect::<Result<Vec<_>, _>>()?,
- negated: in_list.negated,
- }),
- ExprType::Wildcard(_) => Ok(Expr::Wildcard),
- ExprType::ScalarFunction(expr) => {
- let scalar_function = protobuf::ScalarFunction::from_i32(expr.fun)
- .ok_or_else(|| {
- proto_error(format!(
- "Received an unknown scalar function: {}",
- expr.fun
- ))
- })?;
- match scalar_function {
- protobuf::ScalarFunction::Sqrt => {
- Ok(sqrt((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Sin => Ok(sin((&expr.expr[0]).try_into()?)),
- protobuf::ScalarFunction::Cos => Ok(cos((&expr.expr[0]).try_into()?)),
- protobuf::ScalarFunction::Tan => Ok(tan((&expr.expr[0]).try_into()?)),
- // protobuf::ScalarFunction::Asin => Ok(asin((&expr.expr[0]).try_into()?)),
- // protobuf::ScalarFunction::Acos => Ok(acos((&expr.expr[0]).try_into()?)),
- protobuf::ScalarFunction::Atan => {
- Ok(atan((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Exp => Ok(exp((&expr.expr[0]).try_into()?)),
- protobuf::ScalarFunction::Log2 => {
- Ok(log2((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Log10 => {
- Ok(log10((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Floor => {
- Ok(floor((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Ceil => {
- Ok(ceil((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Round => {
- Ok(round((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Trunc => {
- Ok(trunc((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Abs => Ok(abs((&expr.expr[0]).try_into()?)),
- protobuf::ScalarFunction::Signum => {
- Ok(signum((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Octetlength => {
- Ok(length((&expr.expr[0]).try_into()?))
- }
- // protobuf::ScalarFunction::Concat => Ok(concat((&expr.expr[0]).try_into()?)),
- protobuf::ScalarFunction::Lower => {
- Ok(lower((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Upper => {
- Ok(upper((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Trim => {
- Ok(trim((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Ltrim => {
- Ok(ltrim((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Rtrim => {
- Ok(rtrim((&expr.expr[0]).try_into()?))
- }
- // protobuf::ScalarFunction::Totimestamp => Ok(to_timestamp((&expr.expr[0]).try_into()?)),
- // protobuf::ScalarFunction::Array => Ok(array((&expr.expr[0]).try_into()?)),
- // protobuf::ScalarFunction::Nullif => Ok(nullif((&expr.expr[0]).try_into()?)),
- // protobuf::ScalarFunction::Datetrunc => Ok(date_trunc((&expr.expr[0]).try_into()?)),
- // protobuf::ScalarFunction::Md5 => Ok(md5((&expr.expr[0]).try_into()?)),
- protobuf::ScalarFunction::Sha224 => {
- Ok(sha224((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Sha256 => {
- Ok(sha256((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Sha384 => {
- Ok(sha384((&expr.expr[0]).try_into()?))
- }
- protobuf::ScalarFunction::Sha512 => {
- Ok(sha512((&expr.expr[0]).try_into()?))
- }
- _ => Err(proto_error(
- "Protobuf deserialization error: Unsupported scalar function",
- )),
- }
- }
- }
- }
-}
-
-fn from_proto_binary_op(op: &str) -> Result<Operator, BallistaError> {
- match op {
- "And" => Ok(Operator::And),
- "Or" => Ok(Operator::Or),
- "Eq" => Ok(Operator::Eq),
- "NotEq" => Ok(Operator::NotEq),
- "LtEq" => Ok(Operator::LtEq),
- "Lt" => Ok(Operator::Lt),
- "Gt" => Ok(Operator::Gt),
- "GtEq" => Ok(Operator::GtEq),
- "Plus" => Ok(Operator::Plus),
- "Minus" => Ok(Operator::Minus),
- "Multiply" => Ok(Operator::Multiply),
- "Divide" => Ok(Operator::Divide),
- "Like" => Ok(Operator::Like),
- other => Err(proto_error(format!(
- "Unsupported binary operator '{:?}'",
- other
- ))),
- }
-}
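-
-// Usage note (illustrative, not part of the original API surface): the match
-// above keys on the Debug-style name of the operator, so for example
-// `from_proto_binary_op("Eq")` yields `Ok(Operator::Eq)`, while an unhandled
-// name such as "Modulus" falls through to the proto_error branch.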
-
-impl TryInto<arrow::datatypes::DataType> for &protobuf::ScalarType {
- type Error = BallistaError;
- fn try_into(self) -> Result<arrow::datatypes::DataType, Self::Error> {
- let pb_scalartype = self.datatype.as_ref().ok_or_else(|| {
- proto_error("ScalarType message missing required field 'datatype'")
- })?;
- pb_scalartype.try_into()
- }
-}
-
-impl TryInto<Schema> for &protobuf::Schema {
- type Error = BallistaError;
-
- fn try_into(self) -> Result<Schema, BallistaError> {
- let fields = self
- .columns
- .iter()
- .map(|c| {
- let pb_arrow_type: &protobuf::ArrowType = c
- .arrow_type
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: Field message was missing required field 'arrow_type'"))?;
- Ok(Field::new(&c.name, pb_arrow_type.try_into()?, c.nullable))
- })
- .collect::<Result<Vec<_>, _>>()?;
- Ok(Schema::new(fields))
- }
-}
-
-impl TryInto<arrow::datatypes::Field> for &protobuf::Field {
- type Error = BallistaError;
- fn try_into(self) -> Result<arrow::datatypes::Field, Self::Error> {
- let pb_datatype = self.arrow_type.as_ref().ok_or_else(|| {
- proto_error(
- "Protobuf deserialization error: Field message missing required field 'arrow_type'",
- )
- })?;
-
- Ok(arrow::datatypes::Field::new(
- self.name.as_str(),
- pb_datatype.as_ref().try_into()?,
- self.nullable,
- ))
- }
-}
-
-use datafusion::physical_plan::datetime_expressions::{date_trunc, to_timestamp};
-use datafusion::prelude::{
- array, length, lower, ltrim, md5, rtrim, sha224, sha256, sha384, sha512, trim, upper,
-};
-use std::convert::TryFrom;
-
-impl TryFrom<i32> for protobuf::FileType {
- type Error = BallistaError;
- fn try_from(value: i32) -> Result<Self, Self::Error> {
- use protobuf::FileType;
- match value {
- x if x == FileType::NdJson as i32 => Ok(FileType::NdJson),
- x if x == FileType::Parquet as i32 => Ok(FileType::Parquet),
- x if x == FileType::Csv as i32 => Ok(FileType::Csv),
- invalid => Err(BallistaError::General(format!(
- "Attempted to convert invalid i32 to protobuf::Filetype: {}",
- invalid
- ))),
- }
- }
-}
-
-impl Into<datafusion::sql::parser::FileType> for protobuf::FileType {
- fn into(self) -> datafusion::sql::parser::FileType {
- use datafusion::sql::parser::FileType;
- match self {
- protobuf::FileType::NdJson => FileType::NdJson,
- protobuf::FileType::Parquet => FileType::Parquet,
- protobuf::FileType::Csv => FileType::CSV,
- }
- }
-}
-
-fn parse_required_expr(
- p: &Option<Box<protobuf::LogicalExprNode>>,
-) -> Result<Expr, BallistaError> {
- match p {
- Some(expr) => expr.as_ref().try_into(),
- None => Err(proto_error("Missing required expression")),
- }
-}
-
-fn parse_optional_expr(
- p: &Option<Box<protobuf::LogicalExprNode>>,
-) -> Result<Option<Expr>, BallistaError> {
- match p {
- Some(expr) => expr.as_ref().try_into().map(Some),
- None => Ok(None),
- }
-}
diff --git a/rust/ballista/rust/core/src/serde/logical_plan/mod.rs b/rust/ballista/rust/core/src/serde/logical_plan/mod.rs
deleted file mode 100644
index 48dd96c..0000000
--- a/rust/ballista/rust/core/src/serde/logical_plan/mod.rs
+++ /dev/null
@@ -1,929 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-pub mod from_proto;
-pub mod to_proto;
-
-#[cfg(test)]
-mod roundtrip_tests {
-
- use super::super::{super::error::Result, protobuf};
- use crate::error::BallistaError;
- use arrow::datatypes::{DataType, Field, Schema};
- use core::panic;
- use datafusion::physical_plan::functions::BuiltinScalarFunction::Sqrt;
- use datafusion::{
- logical_plan::{Expr, LogicalPlan, LogicalPlanBuilder},
- physical_plan::csv::CsvReadOptions,
- prelude::*,
- scalar::ScalarValue,
- };
- use protobuf::arrow_type;
- use std::convert::TryInto;
-
- // Given an instance of a LogicalPlan, converts it to protobuf and back, using Debug formatting to test equality.
- macro_rules! roundtrip_test {
- ($initial_struct:ident, $proto_type:ty, $struct_type:ty) => {
- let proto: $proto_type = (&$initial_struct).try_into()?;
-
- let round_trip: $struct_type = (&proto).try_into()?;
-
- assert_eq!(
- format!("{:?}", $initial_struct),
- format!("{:?}", round_trip)
- );
- };
- ($initial_struct:ident, $struct_type:ty) => {
- roundtrip_test!($initial_struct, protobuf::LogicalPlanNode, $struct_type);
- };
- ($initial_struct:ident) => {
- roundtrip_test!($initial_struct, protobuf::LogicalPlanNode, LogicalPlan);
- };
- }
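-
- // Usage sketch (derived from the macro body above): `roundtrip_test!(plan)`
- // expands to roughly the following, comparing Debug output for equality:
- //
- // let proto: protobuf::LogicalPlanNode = (&plan).try_into()?;
- // let round_trip: LogicalPlan = (&proto).try_into()?;
- // assert_eq!(format!("{:?}", plan), format!("{:?}", round_trip));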
-
- #[test]
- fn roundtrip_repartition() -> Result<()> {
- use datafusion::logical_plan::Partitioning;
-
- let test_batch_sizes = [usize::MIN, usize::MAX, 43256];
-
- let test_expr: Vec<Expr> = vec![
- Expr::Column("c1".to_string()) + Expr::Column("c2".to_string()),
- Expr::Literal((4.0).into()),
- ];
-
- let schema = Schema::new(vec![
- Field::new("id", DataType::Int32, false),
- Field::new("first_name", DataType::Utf8, false),
- Field::new("last_name", DataType::Utf8, false),
- Field::new("state", DataType::Utf8, false),
- Field::new("salary", DataType::Int32, false),
- ]);
-
- let plan = std::sync::Arc::new(
- LogicalPlanBuilder::scan_csv(
- "employee.csv",
- CsvReadOptions::new().schema(&schema).has_header(true),
- Some(vec![3, 4]),
- )
- .and_then(|plan| plan.sort(vec![col("salary")]))
- .and_then(|plan| plan.build())
- .map_err(BallistaError::DataFusionError)?,
- );
-
- for batch_size in test_batch_sizes.iter() {
- let rr_repartition = Partitioning::RoundRobinBatch(*batch_size);
-
- let roundtrip_plan = LogicalPlan::Repartition {
- input: plan.clone(),
- partitioning_scheme: rr_repartition,
- };
-
- roundtrip_test!(roundtrip_plan);
-
- let h_repartition = Partitioning::Hash(test_expr.clone(), *batch_size);
-
- let roundtrip_plan = LogicalPlan::Repartition {
- input: plan.clone(),
- partitioning_scheme: h_repartition,
- };
-
- roundtrip_test!(roundtrip_plan);
-
- let no_expr_hrepartition = Partitioning::Hash(Vec::new(), *batch_size);
-
- let roundtrip_plan = LogicalPlan::Repartition {
- input: plan.clone(),
- partitioning_scheme: no_expr_hrepartition,
- };
-
- roundtrip_test!(roundtrip_plan);
- }
-
- Ok(())
- }
-
- fn new_box_field(
- name: &str,
- dt: DataType,
- nullable: bool,
- ) -> Box<arrow::datatypes::Field> {
- Box::new(arrow::datatypes::Field::new(name, dt, nullable))
- }
-
- #[test]
- fn scalar_values_error_serialization() -> Result<()> {
- use arrow::datatypes::DataType;
- use datafusion::scalar::ScalarValue;
- let should_fail_on_serialize: Vec<ScalarValue> = vec![
- //Should fail due to inconsistent types
- ScalarValue::List(
- Some(vec![
- ScalarValue::Int16(None),
- ScalarValue::Float32(Some(32.0)),
- ]),
- DataType::List(new_box_field("item", DataType::Int16, true)),
- ),
- ScalarValue::List(
- Some(vec![
- ScalarValue::Float32(None),
- ScalarValue::Float32(Some(32.0)),
- ]),
- DataType::List(new_box_field("item", DataType::Int16, true)),
- ),
- ScalarValue::List(
- Some(vec![
- ScalarValue::List(
- None,
- DataType::List(new_box_field("level2", DataType::Float32, true)),
- ),
- ScalarValue::List(
- Some(vec![
- ScalarValue::Float32(Some(-213.1)),
- ScalarValue::Float32(None),
- ScalarValue::Float32(Some(5.5)),
- ScalarValue::Float32(Some(2.0)),
- ScalarValue::Float32(Some(1.0)),
- ]),
- DataType::List(new_box_field("level2", DataType::Float32, true)),
- ),
- ScalarValue::List(
- None,
- DataType::List(new_box_field(
- "lists are typed inconsistently",
- DataType::Int16,
- true,
- )),
- ),
- ]),
- DataType::List(new_box_field(
- "level1",
- DataType::List(new_box_field("level2", DataType::Float32, true)),
- true,
- )),
- ),
- ];
-
- for test_case in should_fail_on_serialize.into_iter() {
- let res: Result<protobuf::ScalarValue> = (&test_case).try_into();
- if let Ok(val) = res {
- return Err(BallistaError::General(format!(
- "The value {:?} should not have been able to serialize. Serialized to :{:?}",
- test_case, val
- )));
- }
- }
- Ok(())
- }
-
- #[test]
- fn round_trip_scalar_values() -> Result<()> {
- use arrow::datatypes::DataType;
- use datafusion::scalar::ScalarValue;
- let should_pass: Vec<ScalarValue> = vec![
- ScalarValue::Boolean(None),
- ScalarValue::Float32(None),
- ScalarValue::Float64(None),
- ScalarValue::Int8(None),
- ScalarValue::Int16(None),
- ScalarValue::Int32(None),
- ScalarValue::Int64(None),
- ScalarValue::UInt8(None),
- ScalarValue::UInt16(None),
- ScalarValue::UInt32(None),
- ScalarValue::UInt64(None),
- ScalarValue::Utf8(None),
- ScalarValue::LargeUtf8(None),
- ScalarValue::List(None, DataType::Boolean),
- ScalarValue::Date32(None),
- ScalarValue::TimestampMicrosecond(None),
- ScalarValue::TimestampNanosecond(None),
- ScalarValue::Boolean(Some(true)),
- ScalarValue::Boolean(Some(false)),
- ScalarValue::Float32(Some(1.0)),
- ScalarValue::Float32(Some(f32::MAX)),
- ScalarValue::Float32(Some(f32::MIN)),
- ScalarValue::Float32(Some(-2000.0)),
- ScalarValue::Float64(Some(1.0)),
- ScalarValue::Float64(Some(f64::MAX)),
- ScalarValue::Float64(Some(f64::MIN)),
- ScalarValue::Float64(Some(-2000.0)),
- ScalarValue::Int8(Some(i8::MIN)),
- ScalarValue::Int8(Some(i8::MAX)),
- ScalarValue::Int8(Some(0)),
- ScalarValue::Int8(Some(-15)),
- ScalarValue::Int16(Some(i16::MIN)),
- ScalarValue::Int16(Some(i16::MAX)),
- ScalarValue::Int16(Some(0)),
- ScalarValue::Int16(Some(-15)),
- ScalarValue::Int32(Some(i32::MIN)),
- ScalarValue::Int32(Some(i32::MAX)),
- ScalarValue::Int32(Some(0)),
- ScalarValue::Int32(Some(-15)),
- ScalarValue::Int64(Some(i64::MIN)),
- ScalarValue::Int64(Some(i64::MAX)),
- ScalarValue::Int64(Some(0)),
- ScalarValue::Int64(Some(-15)),
- ScalarValue::UInt8(Some(u8::MAX)),
- ScalarValue::UInt8(Some(0)),
- ScalarValue::UInt16(Some(u16::MAX)),
- ScalarValue::UInt16(Some(0)),
- ScalarValue::UInt32(Some(u32::MAX)),
- ScalarValue::UInt32(Some(0)),
- ScalarValue::UInt64(Some(u64::MAX)),
- ScalarValue::UInt64(Some(0)),
- ScalarValue::Utf8(Some(String::from("Test string "))),
- ScalarValue::LargeUtf8(Some(String::from("Test Large utf8"))),
- ScalarValue::Date32(Some(0)),
- ScalarValue::Date32(Some(i32::MAX)),
- ScalarValue::TimestampNanosecond(Some(0)),
- ScalarValue::TimestampNanosecond(Some(i64::MAX)),
- ScalarValue::TimestampMicrosecond(Some(0)),
- ScalarValue::TimestampMicrosecond(Some(i64::MAX)),
- ScalarValue::TimestampMicrosecond(None),
- ScalarValue::List(
- Some(vec![
- ScalarValue::Float32(Some(-213.1)),
- ScalarValue::Float32(None),
- ScalarValue::Float32(Some(5.5)),
- ScalarValue::Float32(Some(2.0)),
- ScalarValue::Float32(Some(1.0)),
- ]),
- DataType::List(new_box_field("level1", DataType::Float32, true)),
- ),
- ScalarValue::List(
- Some(vec![
- ScalarValue::List(
- None,
- DataType::List(new_box_field("level2", DataType::Float32, true)),
- ),
- ScalarValue::List(
- Some(vec![
- ScalarValue::Float32(Some(-213.1)),
- ScalarValue::Float32(None),
- ScalarValue::Float32(Some(5.5)),
- ScalarValue::Float32(Some(2.0)),
- ScalarValue::Float32(Some(1.0)),
- ]),
- DataType::List(new_box_field("level2", DataType::Float32, true)),
- ),
- ]),
- DataType::List(new_box_field(
- "level1",
- DataType::List(new_box_field("level2", DataType::Float32, true)),
- true,
- )),
- ),
- ];
-
- for test_case in should_pass.into_iter() {
- let proto: protobuf::ScalarValue = (&test_case).try_into()?;
- let _roundtrip: ScalarValue = (&proto).try_into()?;
- }
-
- Ok(())
- }
-
- #[test]
- fn round_trip_scalar_types() -> Result<()> {
- use arrow::datatypes::DataType;
- use arrow::datatypes::{IntervalUnit, TimeUnit};
- let should_pass: Vec<DataType> = vec![
- DataType::Boolean,
- DataType::Int8,
- DataType::Int16,
- DataType::Int32,
- DataType::Int64,
- DataType::UInt8,
- DataType::UInt16,
- DataType::UInt32,
- DataType::UInt64,
- DataType::Float32,
- DataType::Float64,
- DataType::Date32,
- DataType::Time64(TimeUnit::Microsecond),
- DataType::Time64(TimeUnit::Nanosecond),
- DataType::Utf8,
- DataType::LargeUtf8,
- //Recursive list tests
- DataType::List(new_box_field("Level1", DataType::Boolean, true)),
- DataType::List(new_box_field(
- "Level1",
- DataType::List(new_box_field("Level2", DataType::Date32, true)),
- true,
- )),
- ];
-
- let should_fail: Vec<DataType> = vec![
- DataType::Null,
- DataType::Float16,
- //Add more timestamp tests
- DataType::Timestamp(TimeUnit::Millisecond, None),
- DataType::Date64,
- DataType::Time32(TimeUnit::Second),
- DataType::Time32(TimeUnit::Millisecond),
- DataType::Time32(TimeUnit::Microsecond),
- DataType::Time32(TimeUnit::Nanosecond),
- DataType::Time64(TimeUnit::Second),
- DataType::Time64(TimeUnit::Millisecond),
- DataType::Duration(TimeUnit::Second),
- DataType::Duration(TimeUnit::Millisecond),
- DataType::Duration(TimeUnit::Microsecond),
- DataType::Duration(TimeUnit::Nanosecond),
- DataType::Interval(IntervalUnit::YearMonth),
- DataType::Interval(IntervalUnit::DayTime),
- DataType::Binary,
- DataType::FixedSizeBinary(0),
- DataType::FixedSizeBinary(1234),
- DataType::FixedSizeBinary(-432),
- DataType::LargeBinary,
- DataType::Decimal(1345, 5431),
- //Recursive list tests
- DataType::List(new_box_field("Level1", DataType::Binary, true)),
- DataType::List(new_box_field(
- "Level1",
- DataType::List(new_box_field(
- "Level2",
- DataType::FixedSizeBinary(53),
- false,
- )),
- true,
- )),
- //Fixed size lists
- DataType::FixedSizeList(new_box_field("Level1", DataType::Binary, true), 4),
- DataType::FixedSizeList(
- new_box_field(
- "Level1",
- DataType::List(new_box_field(
- "Level2",
- DataType::FixedSizeBinary(53),
- false,
- )),
- true,
- ),
- 41,
- ),
- //Struct Testing
- DataType::Struct(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- ]),
- DataType::Struct(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- Field::new(
- "nested_struct",
- DataType::Struct(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- ]),
- true,
- ),
- ]),
- DataType::Union(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- ]),
- DataType::Union(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- Field::new(
- "nested_struct",
- DataType::Struct(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- ]),
- true,
- ),
- ]),
- DataType::Dictionary(
- Box::new(DataType::Utf8),
- Box::new(DataType::Struct(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- ])),
- ),
- DataType::Dictionary(
- Box::new(DataType::Decimal(10, 50)),
- Box::new(DataType::FixedSizeList(
- new_box_field("Level1", DataType::Binary, true),
- 4,
- )),
- ),
- ];
-
- for test_case in should_pass.into_iter() {
- let proto: protobuf::ScalarType = (&test_case).try_into()?;
- let roundtrip: DataType = (&proto).try_into()?;
- assert_eq!(format!("{:?}", test_case), format!("{:?}", roundtrip));
- }
-
- let mut success: Vec<DataType> = Vec::new();
- for test_case in should_fail.into_iter() {
- let proto: Result<protobuf::ScalarType> = (&test_case).try_into();
- if proto.is_ok() {
- success.push(test_case)
- }
- }
- if !success.is_empty() {
- return Err(BallistaError::General(format!(
- "The following items which should have ressulted in an error completed successfully: {:?}",
- success
- )));
- }
- Ok(())
- }
-
- #[test]
- fn round_trip_datatype() -> Result<()> {
- use arrow::datatypes::DataType;
- use arrow::datatypes::{IntervalUnit, TimeUnit};
- let test_cases: Vec<DataType> = vec![
- DataType::Null,
- DataType::Boolean,
- DataType::Int8,
- DataType::Int16,
- DataType::Int32,
- DataType::Int64,
- DataType::UInt8,
- DataType::UInt16,
- DataType::UInt32,
- DataType::UInt64,
- DataType::Float16,
- DataType::Float32,
- DataType::Float64,
- //Add more timestamp tests
- DataType::Timestamp(TimeUnit::Millisecond, None),
- DataType::Date32,
- DataType::Date64,
- DataType::Time32(TimeUnit::Second),
- DataType::Time32(TimeUnit::Millisecond),
- DataType::Time32(TimeUnit::Microsecond),
- DataType::Time32(TimeUnit::Nanosecond),
- DataType::Time64(TimeUnit::Second),
- DataType::Time64(TimeUnit::Millisecond),
- DataType::Time64(TimeUnit::Microsecond),
- DataType::Time64(TimeUnit::Nanosecond),
- DataType::Duration(TimeUnit::Second),
- DataType::Duration(TimeUnit::Millisecond),
- DataType::Duration(TimeUnit::Microsecond),
- DataType::Duration(TimeUnit::Nanosecond),
- DataType::Interval(IntervalUnit::YearMonth),
- DataType::Interval(IntervalUnit::DayTime),
- DataType::Binary,
- DataType::FixedSizeBinary(0),
- DataType::FixedSizeBinary(1234),
- DataType::FixedSizeBinary(-432),
- DataType::LargeBinary,
- DataType::Utf8,
- DataType::LargeUtf8,
- DataType::Decimal(1345, 5431),
- //Recursive list tests
- DataType::List(new_box_field("Level1", DataType::Binary, true)),
- DataType::List(new_box_field(
- "Level1",
- DataType::List(new_box_field(
- "Level2",
- DataType::FixedSizeBinary(53),
- false,
- )),
- true,
- )),
- //Fixed size lists
- DataType::FixedSizeList(new_box_field("Level1", DataType::Binary, true), 4),
- DataType::FixedSizeList(
- new_box_field(
- "Level1",
- DataType::List(new_box_field(
- "Level2",
- DataType::FixedSizeBinary(53),
- false,
- )),
- true,
- ),
- 41,
- ),
- //Struct Testing
- DataType::Struct(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- ]),
- DataType::Struct(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- Field::new(
- "nested_struct",
- DataType::Struct(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- ]),
- true,
- ),
- ]),
- DataType::Union(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- ]),
- DataType::Union(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- Field::new(
- "nested_struct",
- DataType::Struct(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- ]),
- true,
- ),
- ]),
- DataType::Dictionary(
- Box::new(DataType::Utf8),
- Box::new(DataType::Struct(vec![
- Field::new("nullable", DataType::Boolean, false),
- Field::new("name", DataType::Utf8, false),
- Field::new("datatype", DataType::Binary, false),
- ])),
- ),
- DataType::Dictionary(
- Box::new(DataType::Decimal(10, 50)),
- Box::new(DataType::FixedSizeList(
- new_box_field("Level1", DataType::Binary, true),
- 4,
- )),
- ),
- ];
-
- for test_case in test_cases.into_iter() {
- let proto: protobuf::ArrowType = (&test_case).into();
- let roundtrip: DataType = (&proto).try_into()?;
- assert_eq!(format!("{:?}", test_case), format!("{:?}", roundtrip));
- }
- Ok(())
- }
-
- #[test]
- fn roundtrip_null_scalar_values() -> Result<()> {
- use arrow::datatypes::DataType;
- use arrow::datatypes::Field;
- use datafusion::scalar::ScalarValue;
- let test_types = vec![
- ScalarValue::Boolean(None),
- ScalarValue::Float32(None),
- ScalarValue::Float64(None),
- ScalarValue::Int8(None),
- ScalarValue::Int16(None),
- ScalarValue::Int32(None),
- ScalarValue::Int64(None),
- ScalarValue::UInt8(None),
- ScalarValue::UInt16(None),
- ScalarValue::UInt32(None),
- ScalarValue::UInt64(None),
- ScalarValue::Utf8(None),
- ScalarValue::LargeUtf8(None),
- ScalarValue::Date32(None),
- ScalarValue::TimestampMicrosecond(None),
- ScalarValue::TimestampNanosecond(None),
- //ScalarValue::List(None, DataType::Boolean)
- ];
-
- for test_case in test_types.into_iter() {
- let proto_scalar: protobuf::ScalarValue = (&test_case).try_into()?;
- let returned_scalar: datafusion::scalar::ScalarValue =
- (&proto_scalar).try_into()?;
- assert_eq!(
- format!("{:?}", &test_case),
- format!("{:?}", returned_scalar)
- );
- }
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_create_external_table() -> Result<()> {
- let schema = Schema::new(vec![
- Field::new("id", DataType::Int32, false),
- Field::new("first_name", DataType::Utf8, false),
- Field::new("last_name", DataType::Utf8, false),
- Field::new("state", DataType::Utf8, false),
- Field::new("salary", DataType::Int32, false),
- ]);
-
- use datafusion::logical_plan::ToDFSchema;
-
- let df_schema_ref = schema.to_dfschema_ref()?;
-
- use datafusion::sql::parser::FileType;
-
- let filetypes: [FileType; 3] =
- [FileType::NdJson, FileType::Parquet, FileType::CSV];
-
- for file in filetypes.iter() {
- let create_table_node = LogicalPlan::CreateExternalTable {
- schema: df_schema_ref.clone(),
- name: String::from("TestName"),
- location: String::from("employee.csv"),
- file_type: *file,
- has_header: true,
- };
-
- roundtrip_test!(create_table_node);
- }
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_explain() -> Result<()> {
- let schema = Schema::new(vec![
- Field::new("id", DataType::Int32, false),
- Field::new("first_name", DataType::Utf8, false),
- Field::new("last_name", DataType::Utf8, false),
- Field::new("state", DataType::Utf8, false),
- Field::new("salary", DataType::Int32, false),
- ]);
-
- let verbose_plan = LogicalPlanBuilder::scan_csv(
- "employee.csv",
- CsvReadOptions::new().schema(&schema).has_header(true),
- Some(vec![3, 4]),
- )
- .and_then(|plan| plan.sort(vec![col("salary")]))
- .and_then(|plan| plan.explain(true))
- .and_then(|plan| plan.build())
- .map_err(BallistaError::DataFusionError)?;
-
- let plan = LogicalPlanBuilder::scan_csv(
- "employee.csv",
- CsvReadOptions::new().schema(&schema).has_header(true),
- Some(vec![3, 4]),
- )
- .and_then(|plan| plan.sort(vec![col("salary")]))
- .and_then(|plan| plan.explain(false))
- .and_then(|plan| plan.build())
- .map_err(BallistaError::DataFusionError)?;
-
- roundtrip_test!(plan);
-
- roundtrip_test!(verbose_plan);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_join() -> Result<()> {
- let schema = Schema::new(vec![
- Field::new("id", DataType::Int32, false),
- Field::new("first_name", DataType::Utf8, false),
- Field::new("last_name", DataType::Utf8, false),
- Field::new("state", DataType::Utf8, false),
- Field::new("salary", DataType::Int32, false),
- ]);
-
- let scan_plan = LogicalPlanBuilder::empty(false)
- .build()
- .map_err(BallistaError::DataFusionError)?;
- let plan = LogicalPlanBuilder::scan_csv(
- "employee.csv",
- CsvReadOptions::new().schema(&schema).has_header(true),
- Some(vec![3, 4]),
- )
- .and_then(|plan| plan.join(&scan_plan, JoinType::Inner, &["id"], &["id"]))
- .and_then(|plan| plan.build())
- .map_err(BallistaError::DataFusionError)?;
-
- roundtrip_test!(plan);
- Ok(())
- }
-
- #[test]
- fn roundtrip_sort() -> Result<()> {
- let schema = Schema::new(vec![
- Field::new("id", DataType::Int32, false),
- Field::new("first_name", DataType::Utf8, false),
- Field::new("last_name", DataType::Utf8, false),
- Field::new("state", DataType::Utf8, false),
- Field::new("salary", DataType::Int32, false),
- ]);
-
- let plan = LogicalPlanBuilder::scan_csv(
- "employee.csv",
- CsvReadOptions::new().schema(&schema).has_header(true),
- Some(vec![3, 4]),
- )
- .and_then(|plan| plan.sort(vec![col("salary")]))
- .and_then(|plan| plan.build())
- .map_err(BallistaError::DataFusionError)?;
- roundtrip_test!(plan);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_empty_relation() -> Result<()> {
- let plan_false = LogicalPlanBuilder::empty(false)
- .build()
- .map_err(BallistaError::DataFusionError)?;
-
- roundtrip_test!(plan_false);
-
- let plan_true = LogicalPlanBuilder::empty(true)
- .build()
- .map_err(BallistaError::DataFusionError)?;
-
- roundtrip_test!(plan_true);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_logical_plan() -> Result<()> {
- let schema = Schema::new(vec![
- Field::new("id", DataType::Int32, false),
- Field::new("first_name", DataType::Utf8, false),
- Field::new("last_name", DataType::Utf8, false),
- Field::new("state", DataType::Utf8, false),
- Field::new("salary", DataType::Int32, false),
- ]);
-
- let plan = LogicalPlanBuilder::scan_csv(
- "employee.csv",
- CsvReadOptions::new().schema(&schema).has_header(true),
- Some(vec![3, 4]),
- )
- .and_then(|plan| plan.aggregate(vec![col("state")], vec![max(col("salary"))]))
- .and_then(|plan| plan.build())
- .map_err(BallistaError::DataFusionError)?;
-
- roundtrip_test!(plan);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_not() -> Result<()> {
- let test_expr = Expr::Not(Box::new(Expr::Literal((1.0).into())));
-
- roundtrip_test!(test_expr, protobuf::LogicalExprNode, Expr);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_is_null() -> Result<()> {
- let test_expr = Expr::IsNull(Box::new(Expr::Column("id".into())));
-
- roundtrip_test!(test_expr, protobuf::LogicalExprNode, Expr);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_is_not_null() -> Result<()> {
- let test_expr = Expr::IsNotNull(Box::new(Expr::Column("id".into())));
-
- roundtrip_test!(test_expr, protobuf::LogicalExprNode, Expr);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_between() -> Result<()> {
- let test_expr = Expr::Between {
- expr: Box::new(Expr::Literal((1.0).into())),
- negated: true,
- low: Box::new(Expr::Literal((2.0).into())),
- high: Box::new(Expr::Literal((3.0).into())),
- };
-
- roundtrip_test!(test_expr, protobuf::LogicalExprNode, Expr);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_case() -> Result<()> {
- let test_expr = Expr::Case {
- expr: Some(Box::new(Expr::Literal((1.0).into()))),
- when_then_expr: vec![(
- Box::new(Expr::Literal((2.0).into())),
- Box::new(Expr::Literal((3.0).into())),
- )],
- else_expr: Some(Box::new(Expr::Literal((4.0).into()))),
- };
-
- roundtrip_test!(test_expr, protobuf::LogicalExprNode, Expr);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_cast() -> Result<()> {
- let test_expr = Expr::Cast {
- expr: Box::new(Expr::Literal((1.0).into())),
- data_type: DataType::Boolean,
- };
-
- roundtrip_test!(test_expr, protobuf::LogicalExprNode, Expr);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_sort_expr() -> Result<()> {
- let test_expr = Expr::Sort {
- expr: Box::new(Expr::Literal((1.0).into())),
- asc: true,
- nulls_first: true,
- };
-
- roundtrip_test!(test_expr, protobuf::LogicalExprNode, Expr);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_negative() -> Result<()> {
- let test_expr = Expr::Negative(Box::new(Expr::Literal((1.0).into())));
-
- roundtrip_test!(test_expr, protobuf::LogicalExprNode, Expr);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_inlist() -> Result<()> {
- let test_expr = Expr::InList {
- expr: Box::new(Expr::Literal((1.0).into())),
- list: vec![Expr::Literal((2.0).into())],
- negated: true,
- };
-
- roundtrip_test!(test_expr, protobuf::LogicalExprNode, Expr);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_wildcard() -> Result<()> {
- let test_expr = Expr::Wildcard;
-
- roundtrip_test!(test_expr, protobuf::LogicalExprNode, Expr);
-
- Ok(())
- }
-
- #[test]
- fn roundtrip_sqrt() -> Result<()> {
- let test_expr = Expr::ScalarFunction {
- fun: Sqrt,
- args: vec![col("col")],
- };
- roundtrip_test!(test_expr, protobuf::LogicalExprNode, Expr);
-
- Ok(())
- }
-}
diff --git a/rust/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/rust/ballista/rust/core/src/serde/logical_plan/to_proto.rs
deleted file mode 100644
index a181f98..0000000
--- a/rust/ballista/rust/core/src/serde/logical_plan/to_proto.rs
+++ /dev/null
@@ -1,1233 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Serde code to convert Arrow schemas and DataFusion logical plans to Ballista protocol
-//! buffer format, allowing DataFusion logical plans to be serialized and transmitted between
-//! processes.
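-//!
-//! Minimal usage sketch (an assumption for illustration: the generated
-//! `protobuf` types implement `prost::Message`, providing `encode`/`decode`):
-//!
-//! ```ignore
-//! use prost::Message;
-//! // plan: &datafusion::logical_plan::LogicalPlan
-//! let proto: protobuf::LogicalPlanNode = plan.try_into()?;
-//! let mut buf = Vec::new();
-//! proto.encode(&mut buf)?; // bytes ready to send between processes
-//! let node = protobuf::LogicalPlanNode::decode(buf.as_slice())?;
-//! let round_trip: LogicalPlan = (&node).try_into()?;
-//! ```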
-
-use std::{
- boxed,
- convert::{TryFrom, TryInto},
-};
-
-use crate::datasource::DFTableAdapter;
-use crate::serde::{protobuf, BallistaError};
-
-use arrow::datatypes::{DataType, Schema};
-use datafusion::datasource::CsvFile;
-use datafusion::logical_plan::{Expr, JoinType, LogicalPlan};
-use datafusion::physical_plan::aggregates::AggregateFunction;
-use datafusion::{datasource::parquet::ParquetTable, logical_plan::exprlist_to_fields};
-use protobuf::{
- arrow_type, logical_expr_node::ExprType, scalar_type, DateUnit, Field,
- PrimitiveScalarType, ScalarListValue, ScalarType,
-};
-
-use super::super::proto_error;
-use datafusion::physical_plan::functions::BuiltinScalarFunction;
-
-impl protobuf::IntervalUnit {
- pub fn from_arrow_interval_unit(
- interval_unit: &arrow::datatypes::IntervalUnit,
- ) -> Self {
- match interval_unit {
- arrow::datatypes::IntervalUnit::YearMonth => {
- protobuf::IntervalUnit::YearMonth
- }
- arrow::datatypes::IntervalUnit::DayTime => protobuf::IntervalUnit::DayTime,
- }
- }
-
- pub fn from_i32_to_arrow(
- interval_unit_i32: i32,
- ) -> Result<arrow::datatypes::IntervalUnit, BallistaError> {
- let pb_interval_unit = protobuf::IntervalUnit::from_i32(interval_unit_i32);
- use arrow::datatypes::IntervalUnit;
- match pb_interval_unit {
- Some(interval_unit) => Ok(match interval_unit {
- protobuf::IntervalUnit::YearMonth => IntervalUnit::YearMonth,
- protobuf::IntervalUnit::DayTime => IntervalUnit::DayTime,
- }),
- None => Err(proto_error(
- "Error converting i32 to DateUnit: Passed invalid variant",
- )),
- }
- }
-}
-/* Arrow changed dates to no longer have date unit
-
-impl protobuf::DateUnit {
- pub fn from_arrow_date_unit(val: &arrow::datatypes::DateUnit) -> Self {
- match val {
- arrow::datatypes::DateUnit::Day => protobuf::DateUnit::Day,
- arrow::datatypes::DateUnit::Millisecond => protobuf::DateUnit::DateMillisecond,
- }
- }
- pub fn from_i32_to_arrow(date_unit_i32: i32) -> Result<arrow::datatypes::DateUnit, BallistaError> {
- let pb_date_unit = protobuf::DateUnit::from_i32(date_unit_i32);
- use arrow::datatypes::DateUnit;
- match pb_date_unit {
- Some(date_unit) => Ok(match date_unit {
- protobuf::DateUnit::Day => DateUnit::Day,
- protobuf::DateUnit::DateMillisecond => DateUnit::Millisecond,
- }),
- None => Err(proto_error("Error converting i32 to DateUnit: Passed invalid variant")),
- }
- }
-
-}*/
-
-impl protobuf::TimeUnit {
- pub fn from_arrow_time_unit(val: &arrow::datatypes::TimeUnit) -> Self {
- match val {
- arrow::datatypes::TimeUnit::Second => protobuf::TimeUnit::Second,
- arrow::datatypes::TimeUnit::Millisecond => {
- protobuf::TimeUnit::TimeMillisecond
- }
- arrow::datatypes::TimeUnit::Microsecond => protobuf::TimeUnit::Microsecond,
- arrow::datatypes::TimeUnit::Nanosecond => protobuf::TimeUnit::Nanosecond,
- }
- }
- pub fn from_i32_to_arrow(
- time_unit_i32: i32,
- ) -> Result<arrow::datatypes::TimeUnit, BallistaError> {
- let pb_time_unit = protobuf::TimeUnit::from_i32(time_unit_i32);
- use arrow::datatypes::TimeUnit;
- match pb_time_unit {
- Some(time_unit) => Ok(match time_unit {
- protobuf::TimeUnit::Second => TimeUnit::Second,
- protobuf::TimeUnit::TimeMillisecond => TimeUnit::Millisecond,
- protobuf::TimeUnit::Microsecond => TimeUnit::Microsecond,
- protobuf::TimeUnit::Nanosecond => TimeUnit::Nanosecond,
- }),
- None => Err(proto_error(
- "Error converting i32 to TimeUnit: Passed invalid variant",
- )),
- }
- }
-}
-
-impl From<&arrow::datatypes::Field> for protobuf::Field {
- fn from(field: &arrow::datatypes::Field) -> Self {
- protobuf::Field {
- name: field.name().to_owned(),
- arrow_type: Some(Box::new(field.data_type().into())),
- nullable: field.is_nullable(),
- children: Vec::new(),
- }
- }
-}
-
-impl From<&arrow::datatypes::DataType> for protobuf::ArrowType {
- fn from(val: &arrow::datatypes::DataType) -> protobuf::ArrowType {
- protobuf::ArrowType {
- arrow_type_enum: Some(val.into()),
- }
- }
-}
-
-impl TryInto<arrow::datatypes::DataType> for &protobuf::ArrowType {
- type Error = BallistaError;
- fn try_into(self) -> Result<arrow::datatypes::DataType, Self::Error> {
- let pb_arrow_type = self.arrow_type_enum.as_ref().ok_or_else(|| {
- proto_error(
- "Protobuf deserialization error: ArrowType missing required field 'data_type'",
- )
- })?;
- use arrow::datatypes::DataType;
- Ok(match pb_arrow_type {
- protobuf::arrow_type::ArrowTypeEnum::None(_) => DataType::Null,
- protobuf::arrow_type::ArrowTypeEnum::Bool(_) => DataType::Boolean,
- protobuf::arrow_type::ArrowTypeEnum::Uint8(_) => DataType::UInt8,
- protobuf::arrow_type::ArrowTypeEnum::Int8(_) => DataType::Int8,
- protobuf::arrow_type::ArrowTypeEnum::Uint16(_) => DataType::UInt16,
- protobuf::arrow_type::ArrowTypeEnum::Int16(_) => DataType::Int16,
- protobuf::arrow_type::ArrowTypeEnum::Uint32(_) => DataType::UInt32,
- protobuf::arrow_type::ArrowTypeEnum::Int32(_) => DataType::Int32,
- protobuf::arrow_type::ArrowTypeEnum::Uint64(_) => DataType::UInt64,
- protobuf::arrow_type::ArrowTypeEnum::Int64(_) => DataType::Int64,
- protobuf::arrow_type::ArrowTypeEnum::Float16(_) => DataType::Float16,
- protobuf::arrow_type::ArrowTypeEnum::Float32(_) => DataType::Float32,
- protobuf::arrow_type::ArrowTypeEnum::Float64(_) => DataType::Float64,
- protobuf::arrow_type::ArrowTypeEnum::Utf8(_) => DataType::Utf8,
- protobuf::arrow_type::ArrowTypeEnum::LargeUtf8(_) => DataType::LargeUtf8,
- protobuf::arrow_type::ArrowTypeEnum::Binary(_) => DataType::Binary,
- protobuf::arrow_type::ArrowTypeEnum::FixedSizeBinary(size) => {
- DataType::FixedSizeBinary(*size)
- }
- protobuf::arrow_type::ArrowTypeEnum::LargeBinary(_) => DataType::LargeBinary,
- protobuf::arrow_type::ArrowTypeEnum::Date32(_) => DataType::Date32,
- protobuf::arrow_type::ArrowTypeEnum::Date64(_) => DataType::Date64,
- protobuf::arrow_type::ArrowTypeEnum::Duration(time_unit_i32) => {
- DataType::Duration(protobuf::TimeUnit::from_i32_to_arrow(*time_unit_i32)?)
- }
- protobuf::arrow_type::ArrowTypeEnum::Timestamp(timestamp) => {
- DataType::Timestamp(
- protobuf::TimeUnit::from_i32_to_arrow(timestamp.time_unit)?,
- match timestamp.timezone.is_empty() {
- true => None,
- false => Some(timestamp.timezone.to_owned()),
- },
- )
- }
- protobuf::arrow_type::ArrowTypeEnum::Time32(time_unit_i32) => {
- DataType::Time32(protobuf::TimeUnit::from_i32_to_arrow(*time_unit_i32)?)
- }
- protobuf::arrow_type::ArrowTypeEnum::Time64(time_unit_i32) => {
- DataType::Time64(protobuf::TimeUnit::from_i32_to_arrow(*time_unit_i32)?)
- }
- protobuf::arrow_type::ArrowTypeEnum::Interval(interval_unit_i32) => {
- DataType::Interval(protobuf::IntervalUnit::from_i32_to_arrow(
- *interval_unit_i32,
- )?)
- }
- protobuf::arrow_type::ArrowTypeEnum::Decimal(protobuf::Decimal {
- whole,
- fractional,
- }) => DataType::Decimal(*whole as usize, *fractional as usize),
- protobuf::arrow_type::ArrowTypeEnum::List(boxed_list) => {
- let field_ref = boxed_list
- .field_type
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: List message was missing required field 'field_type'"))?
- .as_ref();
- arrow::datatypes::DataType::List(Box::new(field_ref.try_into()?))
- }
- protobuf::arrow_type::ArrowTypeEnum::LargeList(boxed_list) => {
- let field_ref = boxed_list
- .field_type
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: List message was missing required field 'field_type'"))?
- .as_ref();
- arrow::datatypes::DataType::LargeList(Box::new(field_ref.try_into()?))
- }
- protobuf::arrow_type::ArrowTypeEnum::FixedSizeList(boxed_list) => {
- let fsl_ref = boxed_list.as_ref();
- let pb_fieldtype = fsl_ref
- .field_type
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: FixedSizeList message was missing required field 'field_type'"))?;
- arrow::datatypes::DataType::FixedSizeList(
- Box::new(pb_fieldtype.as_ref().try_into()?),
- fsl_ref.list_size,
- )
- }
- protobuf::arrow_type::ArrowTypeEnum::Struct(struct_type) => {
- let fields = struct_type
- .sub_field_types
- .iter()
- .map(|field| field.try_into())
- .collect::<Result<Vec<_>, _>>()?;
- arrow::datatypes::DataType::Struct(fields)
- }
- protobuf::arrow_type::ArrowTypeEnum::Union(union) => {
- let union_types = union
- .union_types
- .iter()
- .map(|field| field.try_into())
- .collect::<Result<Vec<_>, _>>()?;
- arrow::datatypes::DataType::Union(union_types)
- }
- protobuf::arrow_type::ArrowTypeEnum::Dictionary(boxed_dict) => {
- let dict_ref = boxed_dict.as_ref();
- let pb_key = dict_ref
- .key
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: Dictionary message was missing required field 'key'"))?;
- let pb_value = dict_ref
- .value
- .as_ref()
- .ok_or_else(|| proto_error("Protobuf deserialization error: Dictionary message was missing required field 'value'"))?;
- arrow::datatypes::DataType::Dictionary(
- Box::new(pb_key.as_ref().try_into()?),
- Box::new(pb_value.as_ref().try_into()?),
- )
- }
- })
- }
-}
-
-impl TryInto<arrow::datatypes::DataType> for &Box<protobuf::List> {
- type Error = BallistaError;
- fn try_into(self) -> Result<arrow::datatypes::DataType, Self::Error> {
- let list_ref = self.as_ref();
- match &list_ref.field_type {
- Some(pb_field) => {
- let pb_field_ref = pb_field.as_ref();
- let arrow_field: arrow::datatypes::Field = pb_field_ref.try_into()?;
- Ok(arrow::datatypes::DataType::List(Box::new(arrow_field)))
- }
- None => Err(proto_error(
- "List message missing required field 'field_type'",
- )),
- }
- }
-}
-
-impl From<&arrow::datatypes::DataType> for protobuf::arrow_type::ArrowTypeEnum {
- fn from(val: &arrow::datatypes::DataType) -> protobuf::arrow_type::ArrowTypeEnum {
- use protobuf::arrow_type::ArrowTypeEnum;
- use protobuf::ArrowType;
- use protobuf::EmptyMessage;
- match val {
- DataType::Null => ArrowTypeEnum::None(EmptyMessage {}),
- DataType::Boolean => ArrowTypeEnum::Bool(EmptyMessage {}),
- DataType::Int8 => ArrowTypeEnum::Int8(EmptyMessage {}),
- DataType::Int16 => ArrowTypeEnum::Int16(EmptyMessage {}),
- DataType::Int32 => ArrowTypeEnum::Int32(EmptyMessage {}),
- DataType::Int64 => ArrowTypeEnum::Int64(EmptyMessage {}),
- DataType::UInt8 => ArrowTypeEnum::Uint8(EmptyMessage {}),
- DataType::UInt16 => ArrowTypeEnum::Uint16(EmptyMessage {}),
- DataType::UInt32 => ArrowTypeEnum::Uint32(EmptyMessage {}),
- DataType::UInt64 => ArrowTypeEnum::Uint64(EmptyMessage {}),
- DataType::Float16 => ArrowTypeEnum::Float16(EmptyMessage {}),
- DataType::Float32 => ArrowTypeEnum::Float32(EmptyMessage {}),
- DataType::Float64 => ArrowTypeEnum::Float64(EmptyMessage {}),
- DataType::Timestamp(time_unit, timezone) => {
- ArrowTypeEnum::Timestamp(protobuf::Timestamp {
- time_unit: protobuf::TimeUnit::from_arrow_time_unit(time_unit) as i32,
- timezone: timezone.to_owned().unwrap_or_else(String::new),
- })
- }
- DataType::Date32 => ArrowTypeEnum::Date32(EmptyMessage {}),
- DataType::Date64 => ArrowTypeEnum::Date64(EmptyMessage {}),
- DataType::Time32(time_unit) => ArrowTypeEnum::Time32(
- protobuf::TimeUnit::from_arrow_time_unit(time_unit) as i32,
- ),
- DataType::Time64(time_unit) => ArrowTypeEnum::Time64(
- protobuf::TimeUnit::from_arrow_time_unit(time_unit) as i32,
- ),
- DataType::Duration(time_unit) => ArrowTypeEnum::Duration(
- protobuf::TimeUnit::from_arrow_time_unit(time_unit) as i32,
- ),
- DataType::Interval(interval_unit) => ArrowTypeEnum::Interval(
- protobuf::IntervalUnit::from_arrow_interval_unit(interval_unit) as i32,
- ),
- DataType::Binary => ArrowTypeEnum::Binary(EmptyMessage {}),
- DataType::FixedSizeBinary(size) => ArrowTypeEnum::FixedSizeBinary(*size),
- DataType::LargeBinary => ArrowTypeEnum::LargeBinary(EmptyMessage {}),
- DataType::Utf8 => ArrowTypeEnum::Utf8(EmptyMessage {}),
- DataType::LargeUtf8 => ArrowTypeEnum::LargeUtf8(EmptyMessage {}),
- DataType::List(item_type) => ArrowTypeEnum::List(Box::new(protobuf::List {
- field_type: Some(Box::new(item_type.as_ref().into())),
- })),
- DataType::FixedSizeList(item_type, size) => {
- ArrowTypeEnum::FixedSizeList(Box::new(protobuf::FixedSizeList {
- field_type: Some(Box::new(item_type.as_ref().into())),
- list_size: *size,
- }))
- }
- DataType::LargeList(item_type) => {
- ArrowTypeEnum::LargeList(Box::new(protobuf::List {
- field_type: Some(Box::new(item_type.as_ref().into())),
- }))
- }
- DataType::Struct(struct_fields) => ArrowTypeEnum::Struct(protobuf::Struct {
- sub_field_types: struct_fields
- .iter()
- .map(|field| field.into())
- .collect::<Vec<_>>(),
- }),
- DataType::Union(union_types) => ArrowTypeEnum::Union(protobuf::Union {
- union_types: union_types
- .iter()
- .map(|field| field.into())
- .collect::<Vec<_>>(),
- }),
- DataType::Dictionary(key_type, value_type) => {
- ArrowTypeEnum::Dictionary(Box::new(protobuf::Dictionary {
- key: Some(Box::new(key_type.as_ref().into())),
- value: Some(Box::new(value_type.as_ref().into())),
- }))
- }
- DataType::Decimal(whole, fractional) => {
- ArrowTypeEnum::Decimal(protobuf::Decimal {
- whole: *whole as u64,
- fractional: *fractional as u64,
- })
- }
- }
- }
-}
-
-// Does not check whether list subtypes are valid.
-fn is_valid_scalar_type_no_list_check(datatype: &arrow::datatypes::DataType) -> bool {
- match datatype {
- DataType::Boolean
- | DataType::Int8
- | DataType::Int16
- | DataType::Int32
- | DataType::Int64
- | DataType::UInt8
- | DataType::UInt16
- | DataType::UInt32
- | DataType::UInt64
- | DataType::Float32
- | DataType::Float64
- | DataType::LargeUtf8
- | DataType::Utf8
- | DataType::Date32 => true,
- DataType::Time64(time_unit) => matches!(
- time_unit,
- arrow::datatypes::TimeUnit::Microsecond
- | arrow::datatypes::TimeUnit::Nanosecond
- ),
-
- DataType::List(_) => true,
- _ => false,
- }
-}
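-
-// For example: `is_valid_scalar_type_no_list_check(&DataType::Int32)` returns
-// true, while `&DataType::Binary` or `&DataType::Struct(..)` returns false,
-// since those are not leaf scalar types.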
-
-impl TryFrom<&arrow::datatypes::DataType> for protobuf::scalar_type::Datatype {
- type Error = BallistaError;
- fn try_from(val: &arrow::datatypes::DataType) -> Result<Self, Self::Error> {
- use protobuf::scalar_type;
- use protobuf::Field;
- use protobuf::{List, PrimitiveScalarType};
- let scalar_value = match val {
- DataType::Boolean => scalar_type::Datatype::Scalar(PrimitiveScalarType::Bool as i32),
- DataType::Int8 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Int8 as i32),
- DataType::Int16 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Int16 as i32),
- DataType::Int32 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Int32 as i32),
- DataType::Int64 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Int64 as i32),
- DataType::UInt8 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Uint8 as i32),
- DataType::UInt16 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Uint16 as i32),
- DataType::UInt32 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Uint32 as i32),
- DataType::UInt64 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Uint64 as i32),
- DataType::Float32 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Float32 as i32),
- DataType::Float64 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Float64 as i32),
- DataType::Date32 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Date32 as i32),
- DataType::Time64(time_unit) => match time_unit {
- arrow::datatypes::TimeUnit::Microsecond => scalar_type::Datatype::Scalar(PrimitiveScalarType::TimeMicrosecond as i32),
- arrow::datatypes::TimeUnit::Nanosecond => scalar_type::Datatype::Scalar(PrimitiveScalarType::TimeNanosecond as i32),
- _ => {
- return Err(proto_error(format!(
- "Found invalid time unit for scalar value, only TimeUnit::Microsecond and TimeUnit::Nanosecond are valid time units: {:?}",
- time_unit
- )))
- }
- },
- DataType::Utf8 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Utf8 as i32),
- DataType::LargeUtf8 => scalar_type::Datatype::Scalar(PrimitiveScalarType::LargeUtf8 as i32),
- DataType::List(field_type) => {
- let mut field_names: Vec<String> = Vec::new();
- let mut curr_field: &arrow::datatypes::Field = field_type.as_ref();
- field_names.push(curr_field.name().to_owned());
- // For each nested field, check the nested datatype: DataFusion scalars only
- // support recursive lists with a leaf scalar type, so any other compound
- // type is an error.
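- // For example (illustrative): List(List(Float32)) has the leaf scalar type
- // Float32 and is accepted, while List(Struct(..)) or List(Binary) is
- // rejected by the checks below.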
-
- while let DataType::List(nested_field_type) = curr_field.data_type() {
- curr_field = nested_field_type.as_ref();
- field_names.push(curr_field.name().to_owned());
- if !is_valid_scalar_type_no_list_check(curr_field.data_type()) {
- return Err(proto_error(format!("{:?} is an invalid scalar type", curr_field)));
- }
- }
- let deepest_datatype = curr_field.data_type();
- if !is_valid_scalar_type_no_list_check(deepest_datatype) {
- return Err(proto_error(format!("The list nested type {:?} is an invalid scalar type", curr_field)));
- }
- let pb_deepest_type: PrimitiveScalarType = match deepest_datatype {
- DataType::Boolean => PrimitiveScalarType::Bool,
- DataType::Int8 => PrimitiveScalarType::Int8,
- DataType::Int16 => PrimitiveScalarType::Int16,
- DataType::Int32 => PrimitiveScalarType::Int32,
- DataType::Int64 => PrimitiveScalarType::Int64,
- DataType::UInt8 => PrimitiveScalarType::Uint8,
- DataType::UInt16 => PrimitiveScalarType::Uint16,
- DataType::UInt32 => PrimitiveScalarType::Uint32,
- DataType::UInt64 => PrimitiveScalarType::Uint64,
- DataType::Float32 => PrimitiveScalarType::Float32,
- DataType::Float64 => PrimitiveScalarType::Float64,
- DataType::Date32 => PrimitiveScalarType::Date32,
- DataType::Time64(time_unit) => match time_unit {
- arrow::datatypes::TimeUnit::Microsecond => PrimitiveScalarType::TimeMicrosecond,
- arrow::datatypes::TimeUnit::Nanosecond => PrimitiveScalarType::TimeNanosecond,
- _ => {
- return Err(proto_error(format!(
- "Found invalid time unit for scalar value, only TimeUnit::Microsecond and TimeUnit::Nanosecond are valid time units: {:?}",
- time_unit
- )))
- }
- },
-
- DataType::Utf8 => PrimitiveScalarType::Utf8,
- DataType::LargeUtf8 => PrimitiveScalarType::LargeUtf8,
- _ => {
- return Err(proto_error(format!(
- "Error converting to Datatype to scalar type, {:?} is invalid as a datafusion scalar.",
- val
- )))
- }
- };
- protobuf::scalar_type::Datatype::List(protobuf::ScalarListType {
- field_names,
- deepest_type: pb_deepest_type as i32,
- })
- }
- DataType::Null
- | DataType::Float16
- | DataType::Timestamp(_, _)
- | DataType::Date64
- | DataType::Time32(_)
- | DataType::Duration(_)
- | DataType::Interval(_)
- | DataType::Binary
- | DataType::FixedSizeBinary(_)
- | DataType::LargeBinary
- | DataType::FixedSizeList(_, _)
- | DataType::LargeList(_)
- | DataType::Struct(_)
- | DataType::Union(_)
- | DataType::Dictionary(_, _)
- | DataType::Decimal(_, _) => {
- return Err(proto_error(format!(
- "Error converting to Datatype to scalar type, {:?} is invalid as a datafusion scalar.",
- val
- )))
- }
- };
- Ok(scalar_value)
- }
-}
-
-impl TryFrom<&datafusion::scalar::ScalarValue> for protobuf::ScalarValue {
- type Error = BallistaError;
- fn try_from(
- val: &datafusion::scalar::ScalarValue,
- ) -> Result<protobuf::ScalarValue, Self::Error> {
- use datafusion::scalar;
- use protobuf::scalar_value::Value;
- use protobuf::PrimitiveScalarType;
- let scalar_val = match val {
- scalar::ScalarValue::Boolean(val) => {
- create_proto_scalar(val, PrimitiveScalarType::Bool, |s| Value::BoolValue(*s))
- }
- scalar::ScalarValue::Float32(val) => {
- create_proto_scalar(val, PrimitiveScalarType::Float32, |s| {
- Value::Float32Value(*s)
- })
- }
- scalar::ScalarValue::Float64(val) => {
- create_proto_scalar(val, PrimitiveScalarType::Float64, |s| {
- Value::Float64Value(*s)
- })
- }
- scalar::ScalarValue::Int8(val) => {
- create_proto_scalar(val, PrimitiveScalarType::Int8, |s| {
- Value::Int8Value(*s as i32)
- })
- }
- scalar::ScalarValue::Int16(val) => {
- create_proto_scalar(val, PrimitiveScalarType::Int16, |s| {
- Value::Int16Value(*s as i32)
- })
- }
- scalar::ScalarValue::Int32(val) => {
- create_proto_scalar(val, PrimitiveScalarType::Int32, |s| Value::Int32Value(*s))
- }
- scalar::ScalarValue::Int64(val) => {
- create_proto_scalar(val, PrimitiveScalarType::Int64, |s| Value::Int64Value(*s))
- }
- scalar::ScalarValue::UInt8(val) => {
- create_proto_scalar(val, PrimitiveScalarType::Uint8, |s| {
- Value::Uint8Value(*s as u32)
- })
- }
- scalar::ScalarValue::UInt16(val) => {
- create_proto_scalar(val, PrimitiveScalarType::Uint16, |s| {
- Value::Uint16Value(*s as u32)
- })
- }
- scalar::ScalarValue::UInt32(val) => {
- create_proto_scalar(val, PrimitiveScalarType::Uint32, |s| Value::Uint32Value(*s))
- }
- scalar::ScalarValue::UInt64(val) => {
- create_proto_scalar(val, PrimitiveScalarType::Uint64, |s| Value::Uint64Value(*s))
- }
- scalar::ScalarValue::Utf8(val) => {
- create_proto_scalar(val, PrimitiveScalarType::Utf8, |s| {
- Value::Utf8Value(s.to_owned())
- })
- }
- scalar::ScalarValue::LargeUtf8(val) => {
- create_proto_scalar(val, PrimitiveScalarType::LargeUtf8, |s| {
- Value::LargeUtf8Value(s.to_owned())
- })
- }
- scalar::ScalarValue::List(value, datatype) => {
- println!("Current datatype of list: {:?}", datatype);
- match value {
- Some(values) => {
- if values.is_empty() {
- protobuf::ScalarValue {
- value: Some(protobuf::scalar_value::Value::ListValue(
- protobuf::ScalarListValue {
- datatype: Some(datatype.try_into()?),
- values: Vec::new(),
- },
- )),
- }
- } else {
- let scalar_type = match datatype {
- DataType::List(field) => field.as_ref().data_type(),
- _ => todo!("Proper error handling"),
- };
- println!("Current scalar type for list: {:?}", scalar_type);
- let type_checked_values: Vec<protobuf::ScalarValue> = values
- .iter()
- .map(|scalar| match (scalar, scalar_type) {
- (scalar::ScalarValue::List(_, arrow::datatypes::DataType::List(list_field)), arrow::datatypes::DataType::List(field)) => {
- let scalar_datatype = field.data_type();
- let list_datatype = list_field.data_type();
- if std::mem::discriminant(list_datatype) != std::mem::discriminant(scalar_datatype) {
- return Err(proto_error(format!(
- "Protobuf serialization error: Lists with inconsistent typing {:?} and {:?} found within list",
- list_datatype, scalar_datatype
- )));
- }
- scalar.try_into()
- }
- (scalar::ScalarValue::Boolean(_), arrow::datatypes::DataType::Boolean) => scalar.try_into(),
- (scalar::ScalarValue::Float32(_), arrow::datatypes::DataType::Float32) => scalar.try_into(),
- (scalar::ScalarValue::Float64(_), arrow::datatypes::DataType::Float64) => scalar.try_into(),
- (scalar::ScalarValue::Int8(_), arrow::datatypes::DataType::Int8) => scalar.try_into(),
- (scalar::ScalarValue::Int16(_), arrow::datatypes::DataType::Int16) => scalar.try_into(),
- (scalar::ScalarValue::Int32(_), arrow::datatypes::DataType::Int32) => scalar.try_into(),
- (scalar::ScalarValue::Int64(_), arrow::datatypes::DataType::Int64) => scalar.try_into(),
- (scalar::ScalarValue::UInt8(_), arrow::datatypes::DataType::UInt8) => scalar.try_into(),
- (scalar::ScalarValue::UInt16(_), arrow::datatypes::DataType::UInt16) => scalar.try_into(),
- (scalar::ScalarValue::UInt32(_), arrow::datatypes::DataType::UInt32) => scalar.try_into(),
- (scalar::ScalarValue::UInt64(_), arrow::datatypes::DataType::UInt64) => scalar.try_into(),
- (scalar::ScalarValue::Utf8(_), arrow::datatypes::DataType::Utf8) => scalar.try_into(),
- (scalar::ScalarValue::LargeUtf8(_), arrow::datatypes::DataType::LargeUtf8) => scalar.try_into(),
- _ => Err(proto_error(format!(
- "Protobuf serialization error, {:?} was inconsistent with designated type {:?}",
- scalar, datatype
- ))),
- })
- .collect::<Result<Vec<_>, _>>()?;
- protobuf::ScalarValue {
- value: Some(protobuf::scalar_value::Value::ListValue(
- protobuf::ScalarListValue {
- datatype: Some(datatype.try_into()?),
- values: type_checked_values,
- },
- )),
- }
- }
- }
- None => protobuf::ScalarValue {
- value: Some(protobuf::scalar_value::Value::NullListValue(
- datatype.try_into()?,
- )),
- },
- }
- }
- datafusion::scalar::ScalarValue::Date32(val) => {
- create_proto_scalar(val, PrimitiveScalarType::Date32, |s| Value::Date32Value(*s))
- }
- datafusion::scalar::ScalarValue::TimestampMicrosecond(val) => {
- create_proto_scalar(val, PrimitiveScalarType::TimeMicrosecond, |s| {
- Value::TimeMicrosecondValue(*s)
- })
- }
- datafusion::scalar::ScalarValue::TimestampNanosecond(val) => {
- create_proto_scalar(val, PrimitiveScalarType::TimeNanosecond, |s| {
- Value::TimeNanosecondValue(*s)
- })
- }
- _ => {
- return Err(proto_error(format!(
- "Error converting to Datatype to scalar type, {:?} is invalid as a datafusion scalar.",
- val
- )))
- }
- };
- Ok(scalar_val)
- }
-}
-
-impl TryInto<protobuf::LogicalPlanNode> for &LogicalPlan {
- type Error = BallistaError;
-
- fn try_into(self) -> Result<protobuf::LogicalPlanNode, Self::Error> {
- use protobuf::logical_plan_node::LogicalPlanType;
- match self {
- LogicalPlan::TableScan {
- table_name,
- source,
- filters,
- projection,
- ..
- } => {
- let schema = source.schema();
-
- // unwrap the DFTableAdapter to get to the real TableProvider
- let source = if let Some(adapter) =
- source.as_any().downcast_ref::<DFTableAdapter>()
- {
- match &adapter.logical_plan {
- LogicalPlan::TableScan { source, .. } => Ok(source.as_any()),
- _ => Err(BallistaError::General(
- "Invalid LogicalPlan::TableScan".to_owned(),
- )),
- }
- } else {
- Ok(source.as_any())
- }?;
-
- let projection = match projection {
- None => None,
- Some(columns) => {
- let column_names = columns
- .iter()
- .map(|i| schema.field(*i).name().to_owned())
- .collect();
- Some(protobuf::ProjectionColumns {
- columns: column_names,
- })
- }
- };
- let schema: protobuf::Schema = schema.as_ref().into();
-
- let filters: Vec<protobuf::LogicalExprNode> = filters
- .iter()
- .map(|filter| filter.try_into())
- .collect::<Result<Vec<_>, _>>()?;
-
- if let Some(parquet) = source.downcast_ref::<ParquetTable>() {
- Ok(protobuf::LogicalPlanNode {
- logical_plan_type: Some(LogicalPlanType::ParquetScan(
- protobuf::ParquetTableScanNode {
- table_name: table_name.to_owned(),
- path: parquet.path().to_owned(),
- projection,
- schema: Some(schema),
- filters,
- },
- )),
- })
- } else if let Some(csv) = source.downcast_ref::<CsvFile>() {
- let delimiter = [csv.delimiter()];
- let delimiter = std::str::from_utf8(&delimiter).map_err(|_| {
- BallistaError::General("Invalid CSV delimiter".to_owned())
- })?;
- Ok(protobuf::LogicalPlanNode {
- logical_plan_type: Some(LogicalPlanType::CsvScan(
- protobuf::CsvTableScanNode {
- table_name: table_name.to_owned(),
- path: csv.path().to_owned(),
- projection,
- schema: Some(schema),
- has_header: csv.has_header(),
- delimiter: delimiter.to_string(),
- file_extension: csv.file_extension().to_string(),
- filters,
- },
- )),
- })
- } else {
- Err(BallistaError::General(format!(
- "logical plan to_proto unsupported table provider {:?}",
- source
- )))
- }
- }
- LogicalPlan::Projection { expr, input, .. } => {
- Ok(protobuf::LogicalPlanNode {
- logical_plan_type: Some(LogicalPlanType::Projection(Box::new(
- protobuf::ProjectionNode {
- input: Some(Box::new(input.as_ref().try_into()?)),
- expr: expr
- .iter()
- .map(|expr| expr.try_into())
- .collect::<Result<Vec<_>, BallistaError>>()?,
- },
- ))),
- })
- }
- LogicalPlan::Filter { predicate, input } => {
- let input: protobuf::LogicalPlanNode = input.as_ref().try_into()?;
- Ok(protobuf::LogicalPlanNode {
- logical_plan_type: Some(LogicalPlanType::Selection(Box::new(
- protobuf::SelectionNode {
- input: Some(Box::new(input)),
- expr: Some(predicate.try_into()?),
- },
- ))),
- })
- }
- LogicalPlan::Aggregate {
- input,
- group_expr,
- aggr_expr,
- ..
- } => {
- let input: protobuf::LogicalPlanNode = input.as_ref().try_into()?;
- Ok(protobuf::LogicalPlanNode {
- logical_plan_type: Some(LogicalPlanType::Aggregate(Box::new(
- protobuf::AggregateNode {
- input: Some(Box::new(input)),
- group_expr: group_expr
- .iter()
- .map(|expr| expr.try_into())
- .collect::<Result<Vec<_>, BallistaError>>()?,
- aggr_expr: aggr_expr
- .iter()
- .map(|expr| expr.try_into())
- .collect::<Result<Vec<_>, BallistaError>>()?,
- },
- ))),
- })
- }
- LogicalPlan::Join {
- left,
- right,
- on,
- join_type,
- ..
- } => {
- let left: protobuf::LogicalPlanNode = left.as_ref().try_into()?;
- let right: protobuf::LogicalPlanNode = right.as_ref().try_into()?;
- let join_type = match join_type {
- JoinType::Inner => protobuf::JoinType::Inner,
- JoinType::Left => protobuf::JoinType::Left,
- JoinType::Right => protobuf::JoinType::Right,
- };
- let left_join_column = on.iter().map(|on| on.0.to_owned()).collect();
- let right_join_column = on.iter().map(|on| on.1.to_owned()).collect();
- Ok(protobuf::LogicalPlanNode {
- logical_plan_type: Some(LogicalPlanType::Join(Box::new(
- protobuf::JoinNode {
- left: Some(Box::new(left)),
- right: Some(Box::new(right)),
- join_type: join_type.into(),
- left_join_column,
- right_join_column,
- },
- ))),
- })
- }
- LogicalPlan::Limit { input, n } => {
- let input: protobuf::LogicalPlanNode = input.as_ref().try_into()?;
- Ok(protobuf::LogicalPlanNode {
- logical_plan_type: Some(LogicalPlanType::Limit(Box::new(
- protobuf::LimitNode {
- input: Some(Box::new(input)),
- limit: *n as u32,
- },
- ))),
- })
- }
- LogicalPlan::Sort { input, expr } => {
- let input: protobuf::LogicalPlanNode = input.as_ref().try_into()?;
- let selection_expr: Vec<protobuf::LogicalExprNode> = expr
- .iter()
- .map(|expr| expr.try_into())
- .collect::<Result<Vec<_>, BallistaError>>()?;
- Ok(protobuf::LogicalPlanNode {
- logical_plan_type: Some(LogicalPlanType::Sort(Box::new(
- protobuf::SortNode {
- input: Some(Box::new(input)),
- expr: selection_expr,
- },
- ))),
- })
- }
- LogicalPlan::Repartition {
- input,
- partitioning_scheme,
- } => {
- use datafusion::logical_plan::Partitioning;
- let input: protobuf::LogicalPlanNode = input.as_ref().try_into()?;
-
- // The common usize field here is assumed to be the batch size.
- // u64 is used to avoid truncation of large values; most clusters are
- // uniformly 64-bit anyway.
- use protobuf::repartition_node::PartitionMethod;
-
- let pb_partition_method = match partitioning_scheme {
- Partitioning::Hash(exprs, partition_count) => {
- PartitionMethod::Hash(protobuf::HashRepartition {
- hash_expr: exprs
- .iter()
- .map(|expr| expr.try_into())
- .collect::<Result<Vec<_>, BallistaError>>()?,
- partition_count: *partition_count as u64,
- })
- }
- Partitioning::RoundRobinBatch(batch_size) => {
- PartitionMethod::RoundRobin(*batch_size as u64)
- }
- };
-
- Ok(protobuf::LogicalPlanNode {
- logical_plan_type: Some(LogicalPlanType::Repartition(Box::new(
- protobuf::RepartitionNode {
- input: Some(Box::new(input)),
- partition_method: Some(pb_partition_method),
- },
- ))),
- })
- }
- LogicalPlan::EmptyRelation {
- produce_one_row, ..
- } => Ok(protobuf::LogicalPlanNode {
- logical_plan_type: Some(LogicalPlanType::EmptyRelation(
- protobuf::EmptyRelationNode {
- produce_one_row: *produce_one_row,
- },
- )),
- }),
- LogicalPlan::CreateExternalTable {
- name,
- location,
- file_type,
- has_header,
- schema: df_schema,
- } => {
- use datafusion::sql::parser::FileType;
- let schema: Schema = df_schema.as_ref().clone().into();
- let pb_schema: protobuf::Schema = (&schema).try_into().map_err(|e| {
- BallistaError::General(format!(
- "Could not convert schema into protobuf: {:?}",
- e
- ))
- })?;
-
- let pb_file_type: protobuf::FileType = match file_type {
- FileType::NdJson => protobuf::FileType::NdJson,
- FileType::Parquet => protobuf::FileType::Parquet,
- FileType::CSV => protobuf::FileType::Csv,
- };
-
- Ok(protobuf::LogicalPlanNode {
- logical_plan_type: Some(LogicalPlanType::CreateExternalTable(
- protobuf::CreateExternalTableNode {
- name: name.clone(),
- location: location.clone(),
- file_type: pb_file_type as i32,
- has_header: *has_header,
- schema: Some(pb_schema),
- },
- )),
- })
- }
- LogicalPlan::Explain { verbose, plan, .. } => {
- let input: protobuf::LogicalPlanNode = plan.as_ref().try_into()?;
- Ok(protobuf::LogicalPlanNode {
- logical_plan_type: Some(LogicalPlanType::Explain(Box::new(
- protobuf::ExplainNode {
- input: Some(Box::new(input)),
- verbose: *verbose,
- },
- ))),
- })
- }
- LogicalPlan::Extension { .. } => unimplemented!(),
- LogicalPlan::Union { .. } => unimplemented!(),
- }
- }
-}
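-
-// Illustrative note (added; `plan` is a hypothetical binding): any of the
-// LogicalPlan variants handled above serializes with a plain TryInto, e.g.
-// let node: protobuf::LogicalPlanNode = (&plan).try_into()?;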
-
-fn create_proto_scalar<I, T: FnOnce(&I) -> protobuf::scalar_value::Value>(
- v: &Option<I>,
- null_arrow_type: protobuf::PrimitiveScalarType,
- constructor: T,
-) -> protobuf::ScalarValue {
- protobuf::ScalarValue {
- value: Some(v.as_ref().map(constructor).unwrap_or(
- protobuf::scalar_value::Value::NullValue(null_arrow_type as i32),
- )),
- }
-}
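-
-// A hedged, illustrative sketch (not in the original source; the module and
-// test names are hypothetical): `create_proto_scalar` maps a present value
-// through the constructor closure and a missing value to a typed null, so the
-// type information is preserved even when the value is absent.
-#[cfg(test)]
-mod create_proto_scalar_sketch {
- use super::*;
- use protobuf::scalar_value::Value;
-
- #[test]
- fn some_becomes_constructed_value() {
- let v = create_proto_scalar(&Some(3_i64), protobuf::PrimitiveScalarType::Int64, |s| {
- Value::Int64Value(*s)
- });
- assert_eq!(v.value, Some(Value::Int64Value(3)));
- }
-
- #[test]
- fn none_becomes_typed_null() {
- let v = create_proto_scalar(&None::<i64>, protobuf::PrimitiveScalarType::Int64, |s| {
- Value::Int64Value(*s)
- });
- assert_eq!(
- v.value,
- Some(Value::NullValue(protobuf::PrimitiveScalarType::Int64 as i32))
- );
- }
-}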
-
-impl TryInto<protobuf::LogicalExprNode> for &Expr {
- type Error = BallistaError;
-
- fn try_into(self) -> Result<protobuf::LogicalExprNode, Self::Error> {
- use datafusion::scalar::ScalarValue;
- use protobuf::scalar_value::Value;
- match self {
- Expr::Column(name) => {
- let expr = protobuf::LogicalExprNode {
- expr_type: Some(ExprType::ColumnName(name.clone())),
- };
- Ok(expr)
- }
- Expr::Alias(expr, alias) => {
- let alias = Box::new(protobuf::AliasNode {
- expr: Some(Box::new(expr.as_ref().try_into()?)),
- alias: alias.to_owned(),
- });
- let expr = protobuf::LogicalExprNode {
- expr_type: Some(ExprType::Alias(alias)),
- };
- Ok(expr)
- }
- Expr::Literal(value) => {
- let pb_value: protobuf::ScalarValue = value.try_into()?;
- Ok(protobuf::LogicalExprNode {
- expr_type: Some(ExprType::Literal(pb_value)),
- })
- }
- Expr::BinaryExpr { left, op, right } => {
- let binary_expr = Box::new(protobuf::BinaryExprNode {
- l: Some(Box::new(left.as_ref().try_into()?)),
- r: Some(Box::new(right.as_ref().try_into()?)),
- op: format!("{:?}", op),
- });
- Ok(protobuf::LogicalExprNode {
- expr_type: Some(ExprType::BinaryExpr(binary_expr)),
- })
- }
- Expr::AggregateFunction {
- ref fun, ref args, ..
- } => {
- let aggr_function = match fun {
- AggregateFunction::Min => protobuf::AggregateFunction::Min,
- AggregateFunction::Max => protobuf::AggregateFunction::Max,
- AggregateFunction::Sum => protobuf::AggregateFunction::Sum,
- AggregateFunction::Avg => protobuf::AggregateFunction::Avg,
- AggregateFunction::Count => protobuf::AggregateFunction::Count,
- };
-
- let arg = &args[0];
- let aggregate_expr = Box::new(protobuf::AggregateExprNode {
- aggr_function: aggr_function.into(),
- expr: Some(Box::new(arg.try_into()?)),
- });
- Ok(protobuf::LogicalExprNode {
- expr_type: Some(ExprType::AggregateExpr(aggregate_expr)),
- })
- }
- Expr::ScalarVariable(_) => unimplemented!(),
- Expr::ScalarFunction { ref fun, ref args } => {
- let fun: protobuf::ScalarFunction = fun.try_into()?;
- let expr: Vec<protobuf::LogicalExprNode> = args
- .iter()
- .map(|e| e.try_into())
- .collect::<Result<Vec<protobuf::LogicalExprNode>, BallistaError>>()?;
- Ok(protobuf::LogicalExprNode {
- expr_type: Some(
- protobuf::logical_expr_node::ExprType::ScalarFunction(
- protobuf::ScalarFunctionNode {
- fun: fun.into(),
- expr,
- },
- ),
- ),
- })
- }
- Expr::ScalarUDF { .. } => unimplemented!(),
- Expr::AggregateUDF { .. } => unimplemented!(),
- Expr::Not(expr) => {
- let expr = Box::new(protobuf::Not {
- expr: Some(Box::new(expr.as_ref().try_into()?)),
- });
- Ok(protobuf::LogicalExprNode {
- expr_type: Some(ExprType::NotExpr(expr)),
- })
- }
- Expr::IsNull(expr) => {
- let expr = Box::new(protobuf::IsNull {
- expr: Some(Box::new(expr.as_ref().try_into()?)),
- });
- Ok(protobuf::LogicalExprNode {
- expr_type: Some(ExprType::IsNullExpr(expr)),
- })
- }
- Expr::IsNotNull(expr) => {
- let expr = Box::new(protobuf::IsNotNull {
- expr: Some(Box::new(expr.as_ref().try_into()?)),
- });
- Ok(protobuf::LogicalExprNode {
- expr_type: Some(ExprType::IsNotNullExpr(expr)),
- })
- }
- Expr::Between {
- expr,
- negated,
- low,
- high,
- } => {
- let expr = Box::new(protobuf::BetweenNode {
- expr: Some(Box::new(expr.as_ref().try_into()?)),
- negated: *negated,
- low: Some(Box::new(low.as_ref().try_into()?)),
- high: Some(Box::new(high.as_ref().try_into()?)),
- });
- Ok(protobuf::LogicalExprNode {
- expr_type: Some(ExprType::Between(expr)),
- })
- }
- Expr::Case {
- expr,
- when_then_expr,
- else_expr,
- } => {
- let when_then_expr = when_then_expr
- .iter()
- .map(|(w, t)| {
- Ok(protobuf::WhenThen {
- when_expr: Some(w.as_ref().try_into()?),
- then_expr: Some(t.as_ref().try_into()?),
- })
- })
- .collect::<Result<Vec<protobuf::WhenThen>, BallistaError>>()?;
- let expr = Box::new(protobuf::CaseNode {
- expr: match expr {
- Some(e) => Some(Box::new(e.as_ref().try_into()?)),
- None => None,
- },
- when_then_expr,
- else_expr: match else_expr {
- Some(e) => Some(Box::new(e.as_ref().try_into()?)),
- None => None,
- },
- });
- Ok(protobuf::LogicalExprNode {
- expr_type: Some(ExprType::Case(expr)),
- })
- }
- Expr::Cast { expr, data_type } => {
- let expr = Box::new(protobuf::CastNode {
- expr: Some(Box::new(expr.as_ref().try_into()?)),
- arrow_type: Some(data_type.into()),
- });
- Ok(protobuf::LogicalExprNode {
- expr_type: Some(ExprType::Cast(expr)),
- })
- }
- Expr::Sort {
- expr,
- asc,
- nulls_first,
- } => {
- let expr = Box::new(protobuf::SortExprNode {
- expr: Some(Box::new(expr.as_ref().try_into()?)),
- asc: *asc,
- nulls_first: *nulls_first,
- });
- Ok(protobuf::LogicalExprNode {
- expr_type: Some(ExprType::Sort(expr)),
- })
- }
- Expr::Negative(expr) => {
- let expr = Box::new(protobuf::NegativeNode {
- expr: Some(Box::new(expr.as_ref().try_into()?)),
- });
- Ok(protobuf::LogicalExprNode {
- expr_type: Some(protobuf::logical_expr_node::ExprType::Negative(
- expr,
- )),
- })
- }
- Expr::InList {
- expr,
- list,
- negated,
- } => {
- let expr = Box::new(protobuf::InListNode {
- expr: Some(Box::new(expr.as_ref().try_into()?)),
- list: list.iter().map(|expr| expr.try_into()).collect::<Result<
- Vec<_>,
- BallistaError,
- >>(
- )?,
- negated: *negated,
- });
- Ok(protobuf::LogicalExprNode {
- expr_type: Some(protobuf::logical_expr_node::ExprType::InList(expr)),
- })
- }
- Expr::Wildcard => Ok(protobuf::LogicalExprNode {
- expr_type: Some(protobuf::logical_expr_node::ExprType::Wildcard(true)),
- }),
- Expr::TryCast { .. } => unimplemented!(),
- }
- }
-}
-
-impl Into<protobuf::Schema> for &Schema {
- fn into(self) -> protobuf::Schema {
- protobuf::Schema {
- columns: self
- .fields()
- .iter()
- .map(protobuf::Field::from)
- .collect::<Vec<_>>(),
- }
- }
-}
-
-impl TryFrom<&arrow::datatypes::DataType> for protobuf::ScalarType {
- type Error = BallistaError;
- fn try_from(value: &arrow::datatypes::DataType) -> Result<Self, Self::Error> {
- let datatype = protobuf::scalar_type::Datatype::try_from(value)?;
- Ok(protobuf::ScalarType {
- datatype: Some(datatype),
- })
- }
-}
-
-impl TryInto<protobuf::ScalarFunction> for &BuiltinScalarFunction {
- type Error = BallistaError;
- fn try_into(self) -> Result<protobuf::ScalarFunction, Self::Error> {
- match self {
- BuiltinScalarFunction::Sqrt => Ok(protobuf::ScalarFunction::Sqrt),
- BuiltinScalarFunction::Sin => Ok(protobuf::ScalarFunction::Sin),
- BuiltinScalarFunction::Cos => Ok(protobuf::ScalarFunction::Cos),
- BuiltinScalarFunction::Tan => Ok(protobuf::ScalarFunction::Tan),
- BuiltinScalarFunction::Asin => Ok(protobuf::ScalarFunction::Asin),
- BuiltinScalarFunction::Acos => Ok(protobuf::ScalarFunction::Acos),
- BuiltinScalarFunction::Atan => Ok(protobuf::ScalarFunction::Atan),
- BuiltinScalarFunction::Exp => Ok(protobuf::ScalarFunction::Exp),
- BuiltinScalarFunction::Log => Ok(protobuf::ScalarFunction::Log),
- BuiltinScalarFunction::Log10 => Ok(protobuf::ScalarFunction::Log10),
- BuiltinScalarFunction::Floor => Ok(protobuf::ScalarFunction::Floor),
- BuiltinScalarFunction::Ceil => Ok(protobuf::ScalarFunction::Ceil),
- BuiltinScalarFunction::Round => Ok(protobuf::ScalarFunction::Round),
- BuiltinScalarFunction::Trunc => Ok(protobuf::ScalarFunction::Trunc),
- BuiltinScalarFunction::Abs => Ok(protobuf::ScalarFunction::Abs),
- BuiltinScalarFunction::OctetLength => {
- Ok(protobuf::ScalarFunction::Octetlength)
- }
- BuiltinScalarFunction::Concat => Ok(protobuf::ScalarFunction::Concat),
- BuiltinScalarFunction::Lower => Ok(protobuf::ScalarFunction::Lower),
- BuiltinScalarFunction::Upper => Ok(protobuf::ScalarFunction::Upper),
- BuiltinScalarFunction::Trim => Ok(protobuf::ScalarFunction::Trim),
- BuiltinScalarFunction::Ltrim => Ok(protobuf::ScalarFunction::Ltrim),
- BuiltinScalarFunction::Rtrim => Ok(protobuf::ScalarFunction::Rtrim),
- BuiltinScalarFunction::ToTimestamp => {
- Ok(protobuf::ScalarFunction::Totimestamp)
- }
- BuiltinScalarFunction::Array => Ok(protobuf::ScalarFunction::Array),
- BuiltinScalarFunction::NullIf => Ok(protobuf::ScalarFunction::Nullif),
- BuiltinScalarFunction::DateTrunc => Ok(protobuf::ScalarFunction::Datetrunc),
- BuiltinScalarFunction::MD5 => Ok(protobuf::ScalarFunction::Md5),
- BuiltinScalarFunction::SHA224 => Ok(protobuf::ScalarFunction::Sha224),
- BuiltinScalarFunction::SHA256 => Ok(protobuf::ScalarFunction::Sha256),
- BuiltinScalarFunction::SHA384 => Ok(protobuf::ScalarFunction::Sha384),
- BuiltinScalarFunction::SHA512 => Ok(protobuf::ScalarFunction::Sha512),
- _ => Err(BallistaError::General(format!(
- "logical_plan::to_proto() unsupported scalar function {:?}",
- self
- ))),
- }
- }
-}
diff --git a/rust/ballista/rust/core/src/serde/mod.rs b/rust/ballista/rust/core/src/serde/mod.rs
deleted file mode 100644
index b961639..0000000
--- a/rust/ballista/rust/core/src/serde/mod.rs
+++ /dev/null
@@ -1,69 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! This module contains code generated from the Ballista Protocol Buffer
-//! definition, as well as convenience code for interacting with the generated code.
-
-use std::{convert::TryInto, io::Cursor};
-
-use crate::{error::BallistaError, serde::scheduler::Action as BallistaAction};
-
-use prost::Message;
-
-// include the generated protobuf source as a submodule
-#[allow(clippy::all)]
-pub mod protobuf {
- include!(concat!(env!("OUT_DIR"), "/ballista.protobuf.rs"));
-}
-
-pub mod logical_plan;
-pub mod physical_plan;
-pub mod scheduler;
-
-pub fn decode_protobuf(bytes: &[u8]) -> Result<BallistaAction, BallistaError> {
- let mut buf = Cursor::new(bytes);
-
- protobuf::Action::decode(&mut buf)
- .map_err(|e| BallistaError::Internal(format!("{:?}", e)))
- .and_then(|node| node.try_into())
-}
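-
-// Hedged usage sketch (added for illustration; not part of the original file,
-// and the function name is hypothetical): encode a populated protobuf::Action
-// with prost, then decode it back into a BallistaAction via decode_protobuf.
-#[allow(dead_code)]
-fn roundtrip_action(action: protobuf::Action) -> Result<BallistaAction, BallistaError> {
- let mut bytes = Vec::new();
- action
- .encode(&mut bytes)
- .map_err(|e| BallistaError::Internal(format!("{:?}", e)))?;
- decode_protobuf(&bytes)
-}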
-
-pub(crate) fn proto_error<S: Into<String>>(message: S) -> BallistaError {
- BallistaError::General(message.into())
-}
-
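-/// Converts an optional protobuf field via TryInto, mapping a missing field
-/// to a proto_error. Illustrative use (mirroring from_proto.rs below):
-/// `let schema = Arc::new(convert_required!(scan.schema)?);`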
-#[macro_export]
-macro_rules! convert_required {
- ($PB:expr) => {{
- if let Some(field) = $PB.as_ref() {
- field.try_into()
- } else {
- Err(proto_error("Missing required field in protobuf"))
- }
- }};
-}
-
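-/// Like convert_required!, but unwraps the boxed field first, e.g.
-/// `let input: Arc<dyn ExecutionPlan> = convert_box_required!(filter.input)?;`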
-#[macro_export]
-macro_rules! convert_box_required {
- ($PB:expr) => {{
- if let Some(field) = $PB.as_ref() {
- field.as_ref().try_into()
- } else {
- Err(proto_error("Missing required field in protobuf"))
- }
- }};
-}
diff --git a/rust/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/rust/ballista/rust/core/src/serde/physical_plan/from_proto.rs
deleted file mode 100644
index be0777d..0000000
--- a/rust/ballista/rust/core/src/serde/physical_plan/from_proto.rs
+++ /dev/null
@@ -1,398 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Serde code to convert from protocol buffers to Rust data structures.
-
-use std::collections::HashMap;
-use std::convert::TryInto;
-use std::sync::Arc;
-
-use crate::error::BallistaError;
-use crate::execution_plans::{ShuffleReaderExec, UnresolvedShuffleExec};
-use crate::serde::protobuf::repartition_exec_node::PartitionMethod;
-use crate::serde::protobuf::LogicalExprNode;
-use crate::serde::scheduler::PartitionLocation;
-use crate::serde::{proto_error, protobuf};
-use crate::{convert_box_required, convert_required};
-
-use arrow::datatypes::{DataType, Schema, SchemaRef};
-use datafusion::catalog::catalog::{
- CatalogList, CatalogProvider, MemoryCatalogList, MemoryCatalogProvider,
-};
-use datafusion::execution::context::{ExecutionConfig, ExecutionContextState};
-use datafusion::logical_plan::{DFSchema, Expr};
-use datafusion::physical_plan::aggregates::{create_aggregate_expr, AggregateFunction};
-use datafusion::physical_plan::expressions::col;
-use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec};
-use datafusion::physical_plan::hash_join::PartitionMode;
-use datafusion::physical_plan::merge::MergeExec;
-use datafusion::physical_plan::planner::DefaultPhysicalPlanner;
-use datafusion::physical_plan::{
- coalesce_batches::CoalesceBatchesExec,
- csv::CsvExec,
- empty::EmptyExec,
- expressions::{Avg, Column, PhysicalSortExpr},
- filter::FilterExec,
- hash_join::HashJoinExec,
- hash_utils::JoinType,
- limit::{GlobalLimitExec, LocalLimitExec},
- parquet::ParquetExec,
- projection::ProjectionExec,
- repartition::RepartitionExec,
- sort::{SortExec, SortOptions},
- Partitioning,
-};
-use datafusion::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr};
-use datafusion::prelude::CsvReadOptions;
-use log::debug;
-use protobuf::logical_expr_node::ExprType;
-use protobuf::physical_plan_node::PhysicalPlanType;
-
-impl TryInto<Arc<dyn ExecutionPlan>> for &protobuf::PhysicalPlanNode {
- type Error = BallistaError;
-
- fn try_into(self) -> Result<Arc<dyn ExecutionPlan>, Self::Error> {
- let plan = self.physical_plan_type.as_ref().ok_or_else(|| {
- proto_error(format!(
- "physical_plan::from_proto() PhysicalPlanNode is missing its physical_plan_type: {:?}",
- self
- ))
- })?;
- match plan {
- PhysicalPlanType::Projection(projection) => {
- let input: Arc<dyn ExecutionPlan> =
- convert_box_required!(projection.input)?;
- let exprs = projection
- .expr
- .iter()
- .zip(projection.expr_name.iter())
- .map(|(expr, name)| {
- compile_expr(expr, &input.schema()).map(|e| (e, name.to_string()))
- })
- .collect::<Result<Vec<_>, _>>()?;
- Ok(Arc::new(ProjectionExec::try_new(exprs, input)?))
- }
- PhysicalPlanType::Filter(filter) => {
- let input: Arc<dyn ExecutionPlan> = convert_box_required!(filter.input)?;
- let predicate = compile_expr(
- filter.expr.as_ref().ok_or_else(|| {
- BallistaError::General(
- "filter (FilterExecNode) in PhysicalPlanNode is missing."
- .to_owned(),
- )
- })?,
- &input.schema(),
- )?;
- Ok(Arc::new(FilterExec::try_new(predicate, input)?))
- }
- PhysicalPlanType::CsvScan(scan) => {
- let schema = Arc::new(convert_required!(scan.schema)?);
- let options = CsvReadOptions::new()
- .has_header(scan.has_header)
- .file_extension(&scan.file_extension)
- .delimiter(scan.delimiter.as_bytes()[0])
- .schema(&schema);
- let projection = scan.projection.iter().map(|i| *i as usize).collect();
- Ok(Arc::new(CsvExec::try_new(
- &scan.path,
- options,
- Some(projection),
- scan.batch_size as usize,
- None,
- )?))
- }
- PhysicalPlanType::ParquetScan(scan) => {
- let projection = scan.projection.iter().map(|i| *i as usize).collect();
- let filenames: Vec<&str> =
- scan.filename.iter().map(|s| s.as_str()).collect();
- Ok(Arc::new(ParquetExec::try_from_files(
- &filenames,
- Some(projection),
- None,
- scan.batch_size as usize,
- scan.num_partitions as usize,
- None,
- )?))
- }
- PhysicalPlanType::CoalesceBatches(coalesce_batches) => {
- let input: Arc<dyn ExecutionPlan> =
- convert_box_required!(coalesce_batches.input)?;
- Ok(Arc::new(CoalesceBatchesExec::new(
- input,
- coalesce_batches.target_batch_size as usize,
- )))
- }
- PhysicalPlanType::Merge(merge) => {
- let input: Arc<dyn ExecutionPlan> = convert_box_required!(merge.input)?;
- Ok(Arc::new(MergeExec::new(input)))
- }
- PhysicalPlanType::Repartition(repart) => {
- let input: Arc<dyn ExecutionPlan> = convert_box_required!(repart.input)?;
- match repart.partition_method {
- Some(PartitionMethod::Hash(ref hash_part)) => {
- let expr = hash_part
- .hash_expr
- .iter()
- .map(|e| compile_expr(e, &input.schema()))
- .collect::<Result<Vec<Arc<dyn PhysicalExpr>>, _>>()?;
-
- Ok(Arc::new(RepartitionExec::try_new(
- input,
- Partitioning::Hash(
- expr,
- hash_part.partition_count.try_into().unwrap(),
- ),
- )?))
- }
- Some(PartitionMethod::RoundRobin(partition_count)) => {
- Ok(Arc::new(RepartitionExec::try_new(
- input,
- Partitioning::RoundRobinBatch(
- partition_count.try_into().unwrap(),
- ),
- )?))
- }
- Some(PartitionMethod::Unknown(partition_count)) => {
- Ok(Arc::new(RepartitionExec::try_new(
- input,
- Partitioning::UnknownPartitioning(
- partition_count.try_into().unwrap(),
- ),
- )?))
- }
- _ => Err(BallistaError::General(
- "Invalid partitioning scheme".to_owned(),
- )),
- }
- }
- PhysicalPlanType::GlobalLimit(limit) => {
- let input: Arc<dyn ExecutionPlan> = convert_box_required!(limit.input)?;
- Ok(Arc::new(GlobalLimitExec::new(input, limit.limit as usize)))
- }
- PhysicalPlanType::LocalLimit(limit) => {
- let input: Arc<dyn ExecutionPlan> = convert_box_required!(limit.input)?;
- Ok(Arc::new(LocalLimitExec::new(input, limit.limit as usize)))
- }
- PhysicalPlanType::HashAggregate(hash_agg) => {
- let input: Arc<dyn ExecutionPlan> =
- convert_box_required!(hash_agg.input)?;
- let mode = protobuf::AggregateMode::from_i32(hash_agg.mode).ok_or_else(|| {
- proto_error(format!(
- "Received a HashAggregateNode message with unknown AggregateMode {}",
- hash_agg.mode
- ))
- })?;
- let agg_mode: AggregateMode = match mode {
- protobuf::AggregateMode::Partial => AggregateMode::Partial,
- protobuf::AggregateMode::Final => AggregateMode::Final,
- };
-
- let group = hash_agg
- .group_expr
- .iter()
- .zip(hash_agg.group_expr_name.iter())
- .map(|(expr, name)| {
- compile_expr(expr, &input.schema()).map(|e| (e, name.to_string()))
- })
- .collect::<Result<Vec<_>, _>>()?;
-
- let logical_agg_expr: Vec<(Expr, String)> = hash_agg
- .aggr_expr
- .iter()
- .zip(hash_agg.aggr_expr_name.iter())
- .map(|(expr, name)| expr.try_into().map(|expr| (expr, name.clone())))
- .collect::<Result<Vec<_>, _>>()?;
-
- let df_planner = DefaultPhysicalPlanner::default();
- let catalog_list =
- Arc::new(MemoryCatalogList::new()) as Arc<dyn CatalogList>;
- let ctx_state = ExecutionContextState {
- catalog_list,
- scalar_functions: Default::default(),
- var_provider: Default::default(),
- aggregate_functions: Default::default(),
- config: ExecutionConfig::new(),
- };
-
- let input_schema = hash_agg
- .input_schema
- .as_ref()
- .ok_or_else(|| {
- BallistaError::General(
- "input_schema in HashAggregateNode is missing.".to_owned(),
- )
- })?
- .clone();
- let physical_schema: SchemaRef =
- SchemaRef::new((&input_schema).try_into()?);
-
- let mut physical_aggr_expr = vec![];
-
- for (expr, name) in &logical_agg_expr {
- match expr {
- Expr::AggregateFunction { fun, args, .. } => {
- let arg = df_planner
- .create_physical_expr(
- &args[0],
- &physical_schema,
- &ctx_state,
- )
- .map_err(|e| {
- BallistaError::General(format!("{:?}", e))
- })?;
- physical_aggr_expr.push(create_aggregate_expr(
- &fun,
- false,
- &[arg],
- &physical_schema,
- name.to_string(),
- )?);
- }
- _ => {
- return Err(BallistaError::General(
- "Invalid expression for HashAggregateExec".to_string(),
- ))
- }
- }
- }
- Ok(Arc::new(HashAggregateExec::try_new(
- agg_mode,
- group,
- physical_aggr_expr,
- input,
- Arc::new((&input_schema).try_into()?),
- )?))
- }
- PhysicalPlanType::HashJoin(hashjoin) => {
- let left: Arc<dyn ExecutionPlan> = convert_box_required!(hashjoin.left)?;
- let right: Arc<dyn ExecutionPlan> =
- convert_box_required!(hashjoin.right)?;
- let on: Vec<(String, String)> = hashjoin
- .on
- .iter()
- .map(|col| (col.left.clone(), col.right.clone()))
- .collect();
- let join_type = protobuf::JoinType::from_i32(hashjoin.join_type)
- .ok_or_else(|| {
- proto_error(format!(
- "Received a HashJoinNode message with unknown JoinType {}",
- hashjoin.join_type
- ))
- })?;
- let join_type = match join_type {
- protobuf::JoinType::Inner => JoinType::Inner,
- protobuf::JoinType::Left => JoinType::Left,
- protobuf::JoinType::Right => JoinType::Right,
- };
- Ok(Arc::new(HashJoinExec::try_new(
- left,
- right,
- &on,
- &join_type,
- PartitionMode::CollectLeft,
- )?))
- }
- PhysicalPlanType::ShuffleReader(shuffle_reader) => {
- let schema = Arc::new(convert_required!(shuffle_reader.schema)?);
- let partition_location: Vec<PartitionLocation> = shuffle_reader
- .partition_location
- .iter()
- .map(|p| p.clone().try_into())
- .collect::<Result<Vec<_>, BallistaError>>()?;
- let shuffle_reader =
- ShuffleReaderExec::try_new(partition_location, schema)?;
- Ok(Arc::new(shuffle_reader))
- }
- PhysicalPlanType::Empty(empty) => {
- let schema = Arc::new(convert_required!(empty.schema)?);
- Ok(Arc::new(EmptyExec::new(empty.produce_one_row, schema)))
- }
- PhysicalPlanType::Sort(sort) => {
- let input: Arc<dyn ExecutionPlan> = convert_box_required!(sort.input)?;
- let exprs = sort
- .expr
- .iter()
- .map(|expr| {
- let expr = expr.expr_type.as_ref().ok_or_else(|| {
- proto_error(format!(
- "physical_plan::from_proto() Unexpected expr {:?}",
- self
- ))
- })?;
- if let protobuf::logical_expr_node::ExprType::Sort(sort_expr) = expr {
- let expr = sort_expr
- .expr
- .as_ref()
- .ok_or_else(|| {
- proto_error(format!(
- "physical_plan::from_proto() Unexpected sort expr {:?}",
- self
- ))
- })?
- .as_ref();
- Ok(PhysicalSortExpr {
- expr: compile_expr(expr, &input.schema())?,
- options: SortOptions {
- descending: !sort_expr.asc,
- nulls_first: sort_expr.nulls_first,
- },
- })
- } else {
- Err(BallistaError::General(format!(
- "physical_plan::from_proto() {:?}",
- self
- )))
- }
- })
- .collect::<Result<Vec<_>, _>>()?;
- // Update concurrency here in the future
- Ok(Arc::new(SortExec::try_new(exprs, input)?))
- }
- PhysicalPlanType::Unresolved(unresolved_shuffle) => {
- let schema = Arc::new(convert_required!(unresolved_shuffle.schema)?);
- Ok(Arc::new(UnresolvedShuffleExec {
- query_stage_ids: unresolved_shuffle
- .query_stage_ids
- .iter()
- .map(|id| *id as usize)
- .collect(),
- schema,
- partition_count: unresolved_shuffle.partition_count as usize,
- }))
- }
- }
- }
-}
-
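-/// Plans a single protobuf expression into a PhysicalExpr using a throwaway
-/// ExecutionContextState with empty registries, so scalar/aggregate UDFs and
-/// variables cannot be resolved here; only the schema drives the result.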
-fn compile_expr(
- expr: &protobuf::LogicalExprNode,
- schema: &Schema,
-) -> Result<Arc<dyn PhysicalExpr>, BallistaError> {
- let df_planner = DefaultPhysicalPlanner::default();
- let catalog_list = Arc::new(MemoryCatalogList::new()) as Arc<dyn CatalogList>;
- let state = ExecutionContextState {
- catalog_list,
- scalar_functions: HashMap::new(),
- var_provider: HashMap::new(),
- aggregate_functions: HashMap::new(),
- config: ExecutionConfig::new(),
- };
- let expr: Expr = expr.try_into()?;
- df_planner
- .create_physical_expr(&expr, schema, &state)
- .map_err(|e| BallistaError::General(format!("{:?}", e)))
-}
diff --git a/rust/ballista/rust/core/src/serde/physical_plan/mod.rs b/rust/ballista/rust/core/src/serde/physical_plan/mod.rs
deleted file mode 100644
index e7985cc..0000000
--- a/rust/ballista/rust/core/src/serde/physical_plan/mod.rs
+++ /dev/null
@@ -1,178 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-pub mod from_proto;
-pub mod to_proto;
-
-#[cfg(test)]
-mod roundtrip_tests {
- use datafusion::physical_plan::hash_utils::JoinType;
- use std::{convert::TryInto, sync::Arc};
-
- use arrow::datatypes::{DataType, Schema};
- use datafusion::physical_plan::ColumnarValue;
- use datafusion::physical_plan::{
- empty::EmptyExec,
- expressions::{Avg, Column, PhysicalSortExpr},
- hash_aggregate::{AggregateMode, HashAggregateExec},
- hash_join::HashJoinExec,
- limit::{GlobalLimitExec, LocalLimitExec},
- sort::SortExec,
- ExecutionPlan,
- };
- use datafusion::physical_plan::{
- AggregateExpr, Distribution, Partitioning, PhysicalExpr,
- };
-
- use super::super::super::error::Result;
- use super::super::protobuf;
- use datafusion::physical_plan::hash_join::PartitionMode;
-
- fn roundtrip_test(exec_plan: Arc<dyn ExecutionPlan>) -> Result<()> {
- let proto: protobuf::PhysicalPlanNode = exec_plan.clone().try_into()?;
- let result_exec_plan: Arc<dyn ExecutionPlan> = (&proto).try_into()?;
- assert_eq!(
- format!("{:?}", exec_plan),
- format!("{:?}", result_exec_plan)
- );
- Ok(())
- }
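-
- // Note: ExecutionPlan does not implement PartialEq, so roundtrip_test above
- // compares Debug output as a proxy for structural equality of the two plans.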
-
- #[test]
- fn roundtrip_empty() -> Result<()> {
- roundtrip_test(Arc::new(EmptyExec::new(false, Arc::new(Schema::empty()))))
- }
-
- #[test]
- fn roundtrip_local_limit() -> Result<()> {
- roundtrip_test(Arc::new(LocalLimitExec::new(
- Arc::new(EmptyExec::new(false, Arc::new(Schema::empty()))),
- 25,
- )))
- }
-
- #[test]
- fn roundtrip_global_limit() -> Result<()> {
- roundtrip_test(Arc::new(GlobalLimitExec::new(
- Arc::new(EmptyExec::new(false, Arc::new(Schema::empty()))),
- 25,
- )))
- }
-
- #[test]
- fn roundtrip_hash_join() -> Result<()> {
- use arrow::datatypes::{DataType, Field, Schema};
- let field_a = Field::new("col", DataType::Int64, false);
- let schema_left = Schema::new(vec![field_a.clone()]);
- let schema_right = Schema::new(vec![field_a]);
-
- roundtrip_test(Arc::new(HashJoinExec::try_new(
- Arc::new(EmptyExec::new(false, Arc::new(schema_left))),
- Arc::new(EmptyExec::new(false, Arc::new(schema_right))),
- &[("col".to_string(), "col".to_string())],
- &JoinType::Inner,
- PartitionMode::CollectLeft,
- )?))
- }
-
- fn col(name: &str) -> Arc<dyn PhysicalExpr> {
- Arc::new(Column::new(name))
- }
-
- #[test]
- fn roundtrip_hash_aggregate() -> Result<()> {
- use arrow::datatypes::{DataType, Field, Schema};
- let groups: Vec<(Arc<dyn PhysicalExpr>, String)> =
- vec![(col("a"), "unused".to_string())];
-
... 70207 lines suppressed ...