You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/01/24 11:41:28 UTC

[arrow-datafusion] branch master updated: Add dictionary_expressions feature (#4386) (#4999)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 9f498bbc8 Add dictionary_expressions feature (#4386) (#4999)
9f498bbc8 is described below

commit 9f498bbc8ec5bd0326abe30fe2ffa081a199c904
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Tue Jan 24 11:41:19 2023 +0000

    Add dictionary_expressions feature (#4386) (#4999)
    
    * Add dictionary_expressions feature (#4386)
    
    * Toml format
---
 .github/workflows/rust.yml                         | 4 ++--
 datafusion/core/Cargo.toml                         | 4 +++-
 datafusion/core/tests/path_partition.rs            | 2 +-
 datafusion/core/tests/sql/select.rs                | 1 +
 datafusion/physical-expr/Cargo.toml                | 7 +++++--
 datafusion/physical-expr/src/expressions/binary.rs | 1 +
 6 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index cbc541886..e73e52cba 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -95,11 +95,11 @@ jobs:
       - name: Build tests
         run: |
           export PATH=$PATH:$HOME/d/protoc/bin
-          cargo test --features avro,jit,scheduler,json --no-run
+          cargo test --features avro,jit,scheduler,json,dictionary_expressions --no-run
       - name: Run tests
         run: |
           export PATH=$PATH:$HOME/d/protoc/bin
-          cargo test --features avro,jit,scheduler,json
+          cargo test --features avro,jit,scheduler,json,dictionary_expressions
       - name: Run examples
         run: |
           export PATH=$PATH:$HOME/d/protoc/bin
diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index becc733eb..00ca39f47 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -43,6 +43,9 @@ avro = ["apache-avro", "num-traits", "datafusion-common/avro"]
 compression = ["xz2", "bzip2", "flate2", "async-compression"]
 crypto_expressions = ["datafusion-physical-expr/crypto_expressions"]
 default = ["crypto_expressions", "regex_expressions", "unicode_expressions", "compression"]
+# Enables support for non-scalar, binary operations on dictionaries
+# Note: this results in significant additional codegen
+dictionary_expressions = ["datafusion-physical-expr/dictionary_expressions"]
 # Used for testing ONLY: causes all values to hash to the same value (test for collisions)
 force_hash_collisions = []
 # Used to enable JIT code generation
@@ -102,7 +105,6 @@ xz2 = { version = "0.1", optional = true }
 
 
 [dev-dependencies]
-arrow = { version = "31.0.0", features = ["prettyprint", "dyn_cmp_dict"] }
 async-trait = "0.1.53"
 bigdecimal = "0.3.0"
 criterion = "0.4"
diff --git a/datafusion/core/tests/path_partition.rs b/datafusion/core/tests/path_partition.rs
index 2d257d49a..670b508c3 100644
--- a/datafusion/core/tests/path_partition.rs
+++ b/datafusion/core/tests/path_partition.rs
@@ -204,7 +204,7 @@ async fn csv_filter_with_file_col() -> Result<()> {
     );
 
     let result = ctx
-        .sql("SELECT c1, c2 FROM t WHERE date='2021-10-27' and date!=c1 LIMIT 5")
+        .sql("SELECT c1, c2 FROM t WHERE date='2021-10-27' and c1!='2021-10-27' LIMIT 5")
         .await?
         .collect()
         .await?;
diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs
index f65835101..124f25d36 100644
--- a/datafusion/core/tests/sql/select.rs
+++ b/datafusion/core/tests/sql/select.rs
@@ -621,6 +621,7 @@ async fn query_nested_get_indexed_field_on_struct() -> Result<()> {
 }
 
 #[tokio::test]
+#[cfg(feature = "dictionary_expressions")]
 async fn query_on_string_dictionary() -> Result<()> {
     // Test to ensure DataFusion can operate on dictionary types
     // Use StringDictionary (32 bit indexes = keys)
diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index 5b25d9707..6ab1bb7fa 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -24,7 +24,7 @@ repository = "https://github.com/apache/arrow-datafusion"
 readme = "README.md"
 authors = ["Apache Arrow <de...@arrow.apache.org>"]
 license = "Apache-2.0"
-keywords = [ "arrow", "query", "sql" ]
+keywords = ["arrow", "query", "sql"]
 edition = "2021"
 rust-version = "1.62"
 
@@ -35,12 +35,15 @@ path = "src/lib.rs"
 [features]
 crypto_expressions = ["md-5", "sha2", "blake2", "blake3"]
 default = ["crypto_expressions", "regex_expressions", "unicode_expressions"]
+# Enables support for non-scalar, binary operations on dictionaries
+# Note: this results in significant additional codegen
+dictionary_expressions = ["arrow/dyn_cmp_dict", "arrow/dyn_arith_dict"]
 regex_expressions = ["regex"]
 unicode_expressions = ["unicode-segmentation"]
 
 [dependencies]
 ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }
-arrow = { version = "31.0.0", features = ["prettyprint", "dyn_cmp_dict"] }
+arrow = { version = "31.0.0", features = ["prettyprint"] }
 arrow-buffer = "31.0.0"
 arrow-schema = "31.0.0"
 blake2 = { version = "^0.10.2", optional = true }
diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs
index df90be163..d2346d278 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -1502,6 +1502,7 @@ mod tests {
     // is no way at the time of this writing to create a dictionary
     // array using the `From` trait
     #[test]
+    #[cfg(feature = "dictionary_expressions")]
     fn test_dictionary_type_to_array_coersion() -> Result<()> {
         // Test string  a string dictionary
         let dict_type =