You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/01/24 11:41:28 UTC
[arrow-datafusion] branch master updated: Add dictionary_expressions feature (#4386) (#4999)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new 9f498bbc8 Add dictionary_expressions feature (#4386) (#4999)
9f498bbc8 is described below
commit 9f498bbc8ec5bd0326abe30fe2ffa081a199c904
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Tue Jan 24 11:41:19 2023 +0000
Add dictionary_expressions feature (#4386) (#4999)
* Add dictionary_expressions feature (#4386)
* Toml format
---
.github/workflows/rust.yml | 4 ++--
datafusion/core/Cargo.toml | 4 +++-
datafusion/core/tests/path_partition.rs | 2 +-
datafusion/core/tests/sql/select.rs | 1 +
datafusion/physical-expr/Cargo.toml | 7 +++++--
datafusion/physical-expr/src/expressions/binary.rs | 1 +
6 files changed, 13 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index cbc541886..e73e52cba 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -95,11 +95,11 @@ jobs:
- name: Build tests
run: |
export PATH=$PATH:$HOME/d/protoc/bin
- cargo test --features avro,jit,scheduler,json --no-run
+ cargo test --features avro,jit,scheduler,json,dictionary_expressions --no-run
- name: Run tests
run: |
export PATH=$PATH:$HOME/d/protoc/bin
- cargo test --features avro,jit,scheduler,json
+ cargo test --features avro,jit,scheduler,json,dictionary_expressions
- name: Run examples
run: |
export PATH=$PATH:$HOME/d/protoc/bin
diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index becc733eb..00ca39f47 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -43,6 +43,9 @@ avro = ["apache-avro", "num-traits", "datafusion-common/avro"]
compression = ["xz2", "bzip2", "flate2", "async-compression"]
crypto_expressions = ["datafusion-physical-expr/crypto_expressions"]
default = ["crypto_expressions", "regex_expressions", "unicode_expressions", "compression"]
+# Enables support for non-scalar, binary operations on dictionaries
+# Note: this results in significant additional codegen
+dictionary_expressions = ["datafusion-physical-expr/dictionary_expressions"]
# Used for testing ONLY: causes all values to hash to the same value (test for collisions)
force_hash_collisions = []
# Used to enable JIT code generation
@@ -102,7 +105,6 @@ xz2 = { version = "0.1", optional = true }
[dev-dependencies]
-arrow = { version = "31.0.0", features = ["prettyprint", "dyn_cmp_dict"] }
async-trait = "0.1.53"
bigdecimal = "0.3.0"
criterion = "0.4"
diff --git a/datafusion/core/tests/path_partition.rs b/datafusion/core/tests/path_partition.rs
index 2d257d49a..670b508c3 100644
--- a/datafusion/core/tests/path_partition.rs
+++ b/datafusion/core/tests/path_partition.rs
@@ -204,7 +204,7 @@ async fn csv_filter_with_file_col() -> Result<()> {
);
let result = ctx
- .sql("SELECT c1, c2 FROM t WHERE date='2021-10-27' and date!=c1 LIMIT 5")
+ .sql("SELECT c1, c2 FROM t WHERE date='2021-10-27' and c1!='2021-10-27' LIMIT 5")
.await?
.collect()
.await?;
diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs
index f65835101..124f25d36 100644
--- a/datafusion/core/tests/sql/select.rs
+++ b/datafusion/core/tests/sql/select.rs
@@ -621,6 +621,7 @@ async fn query_nested_get_indexed_field_on_struct() -> Result<()> {
}
#[tokio::test]
+#[cfg(feature = "dictionary_expressions")]
async fn query_on_string_dictionary() -> Result<()> {
// Test to ensure DataFusion can operate on dictionary types
// Use StringDictionary (32 bit indexes = keys)
diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index 5b25d9707..6ab1bb7fa 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -24,7 +24,7 @@ repository = "https://github.com/apache/arrow-datafusion"
readme = "README.md"
authors = ["Apache Arrow <de...@arrow.apache.org>"]
license = "Apache-2.0"
-keywords = [ "arrow", "query", "sql" ]
+keywords = ["arrow", "query", "sql"]
edition = "2021"
rust-version = "1.62"
@@ -35,12 +35,15 @@ path = "src/lib.rs"
[features]
crypto_expressions = ["md-5", "sha2", "blake2", "blake3"]
default = ["crypto_expressions", "regex_expressions", "unicode_expressions"]
+# Enables support for non-scalar, binary operations on dictionaries
+# Note: this results in significant additional codegen
+dictionary_expressions = ["arrow/dyn_cmp_dict", "arrow/dyn_arith_dict"]
regex_expressions = ["regex"]
unicode_expressions = ["unicode-segmentation"]
[dependencies]
ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }
-arrow = { version = "31.0.0", features = ["prettyprint", "dyn_cmp_dict"] }
+arrow = { version = "31.0.0", features = ["prettyprint"] }
arrow-buffer = "31.0.0"
arrow-schema = "31.0.0"
blake2 = { version = "^0.10.2", optional = true }
diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs
index df90be163..d2346d278 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -1502,6 +1502,7 @@ mod tests {
// is no way at the time of this writing to create a dictionary
// array using the `From` trait
#[test]
+ #[cfg(feature = "dictionary_expressions")]
fn test_dictionary_type_to_array_coersion() -> Result<()> {
// Test string a string dictionary
let dict_type =