You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/05/03 14:33:20 UTC

[arrow-datafusion] branch master updated: Count distinct boolean (#230)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new b9b3d86  Count distinct boolean (#230)
b9b3d86 is described below

commit b9b3d863d7765ceccf92155da7ec2e5f28d8feee
Author: Patrick More <34...@users.noreply.github.com>
AuthorDate: Mon May 3 07:33:12 2021 -0700

    Count distinct boolean (#230)
    
    * Added boolean support for count distinct.
    
    * Added boolean support for COUNT DISTINCT
    
    * Corrected macro call
    
    * Added test for boolean COUNT DISTINCT
    
    * ran cargo fmt
    
    * Corrected test assertion for boolean COUNT DISTINCT
    
    * Fixed clippy warnings
    
    * fix cargo fmt
---
 .../src/physical_plan/distinct_expressions.rs      | 60 ++++++++++++++++++++--
 datafusion/src/scalar.rs                           |  1 +
 2 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs
index 8534e9c..1c93b5a 100644
--- a/datafusion/src/physical_plan/distinct_expressions.rs
+++ b/datafusion/src/physical_plan/distinct_expressions.rs
@@ -195,10 +195,9 @@ impl Accumulator for DistinctCountAccumulator {
 mod tests {
     use super::*;
 
-    use arrow::array::ArrayRef;
     use arrow::array::{
-        Int16Array, Int32Array, Int64Array, Int8Array, ListArray, UInt16Array,
-        UInt32Array, UInt64Array, UInt8Array,
+        ArrayRef, BooleanArray, Int16Array, Int32Array, Int64Array, Int8Array, ListArray,
+        UInt16Array, UInt32Array, UInt64Array, UInt8Array,
     };
     use arrow::array::{Int32Builder, ListBuilder, UInt64Builder};
     use arrow::datatypes::DataType;
@@ -397,6 +396,61 @@ mod tests {
     }
 
     #[test]
+    fn count_distinct_update_batch_boolean() -> Result<()> {
+        let get_count = |data: BooleanArray| -> Result<(Vec<Option<bool>>, u64)> {
+            let arrays = vec![Arc::new(data) as ArrayRef];
+            let (states, result) = run_update_batch(&arrays)?;
+            let mut state_vec = state_to_vec!(&states[0], Boolean, bool).unwrap();
+            state_vec.sort();
+            let count = match result {
+                ScalarValue::UInt64(c) => c.ok_or_else(|| {
+                    DataFusionError::Internal("Found None count".to_string())
+                }),
+                scalar => Err(DataFusionError::Internal(format!(
+                    "Found non Uint64 scalar value from count: {}",
+                    scalar
+                ))),
+            }?;
+            Ok((state_vec, count))
+        };
+
+        let zero_count_values = BooleanArray::from(Vec::<bool>::new());
+
+        let one_count_values = BooleanArray::from(vec![false, false]);
+        let one_count_values_with_null =
+            BooleanArray::from(vec![Some(true), Some(true), None, None]);
+
+        let two_count_values = BooleanArray::from(vec![true, false, true, false, true]);
+        let two_count_values_with_null = BooleanArray::from(vec![
+            Some(true),
+            Some(false),
+            None,
+            None,
+            Some(true),
+            Some(false),
+        ]);
+
+        assert_eq!(
+            get_count(zero_count_values)?,
+            (Vec::<Option<bool>>::new(), 0)
+        );
+        assert_eq!(get_count(one_count_values)?, (vec![Some(false)], 1));
+        assert_eq!(
+            get_count(one_count_values_with_null)?,
+            (vec![Some(true)], 1)
+        );
+        assert_eq!(
+            get_count(two_count_values)?,
+            (vec![Some(false), Some(true)], 2)
+        );
+        assert_eq!(
+            get_count(two_count_values_with_null)?,
+            (vec![Some(false), Some(true)], 2)
+        );
+        Ok(())
+    }
+
+    #[test]
     fn count_distinct_update_batch_all_nulls() -> Result<()> {
         let arrays = vec![Arc::new(Int32Array::from(
             vec![None, None, None, None] as Vec<Option<i32>>
diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs
index 833f707..6f03194 100644
--- a/datafusion/src/scalar.rs
+++ b/datafusion/src/scalar.rs
@@ -345,6 +345,7 @@ impl ScalarValue {
                 ),
             },
             ScalarValue::List(values, data_type) => Arc::new(match data_type {
+                DataType::Boolean => build_list!(BooleanBuilder, Boolean, values, size),
                 DataType::Int8 => build_list!(Int8Builder, Int8, values, size),
                 DataType::Int16 => build_list!(Int16Builder, Int16, values, size),
                 DataType::Int32 => build_list!(Int32Builder, Int32, values, size),