You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/08/03 19:52:53 UTC

[GitHub] [arrow-datafusion] alamb commented on a change in pull request #813: Speed up inlist for strings and primitives

alamb commented on a change in pull request #813:
URL: https://github.com/apache/arrow-datafusion/pull/813#discussion_r682053016



##########
File path: datafusion/src/physical_plan/expressions/in_list.rs
##########
@@ -99,6 +124,104 @@ macro_rules! make_contains {
     }};
 }
 
+macro_rules! make_contains_primitive {
+    ($ARRAY:expr, $LIST_VALUES:expr, $NEGATED:expr, $SCALAR_VALUE:ident, $ARRAY_TYPE:ident) => {{
+        let array = $ARRAY.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap();
+
+        let mut contains_null = false;
+        let values = $LIST_VALUES
+            .iter()
+            .flat_map(|expr| match expr {
+                ColumnarValue::Scalar(s) => match s {

Review comment:
       Given this code just seems to pass along the values,  wonder if it would be clearer / faster here to look for nulls using the `ScalarValue::is_null()`, perhaps like
   
   ```rust
   let contains_nulls = $LIST_VALUES.iter().any(|expr| expr.is_null());
   ```
   
   And that way you could stop when you hit the first one as well as potentially avoid a collect into a new `Vec`

##########
File path: datafusion/src/physical_plan/expressions/in_list.rs
##########
@@ -234,16 +363,40 @@ impl PhysicalExpr for InListExpr {
 
         match value_data_type {
             DataType::Float32 => {
-                make_contains!(array, list_values, self.negated, Float32, Float32Array)
+                make_contains_primitive!(
+                    array,
+                    list_values,
+                    self.negated,
+                    Float32,
+                    Float32Array
+                )
             }
             DataType::Float64 => {
-                make_contains!(array, list_values, self.negated, Float64, Float64Array)
+                make_contains_primitive!(
+                    array,
+                    list_values,
+                    self.negated,
+                    Float64,
+                    Float64Array
+                )
             }
             DataType::Int16 => {
-                make_contains!(array, list_values, self.negated, Int16, Int16Array)
+                make_contains_primitive!(
+                    array,
+                    list_values,
+                    self.negated,
+                    Int16,
+                    Int16Array
+                )
             }
             DataType::Int32 => {
-                make_contains!(array, list_values, self.negated, Int32, Int32Array)
+                make_contains_primitive!(
+                    array,
+                    list_values,
+                    self.negated,
+                    Int32,
+                    Int32Array
+                )
             }
             DataType::Int64 => {
                 make_contains!(array, list_values, self.negated, Int64, Int64Array)

Review comment:
       Is there a reason not to apply the same transformation to `Int64` and `Int8`?

##########
File path: datafusion/src/physical_plan/expressions/in_list.rs
##########
@@ -99,6 +124,104 @@ macro_rules! make_contains {
     }};
 }
 
+macro_rules! make_contains_primitive {
+    ($ARRAY:expr, $LIST_VALUES:expr, $NEGATED:expr, $SCALAR_VALUE:ident, $ARRAY_TYPE:ident) => {{
+        let array = $ARRAY.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap();
+
+        let mut contains_null = false;
+        let values = $LIST_VALUES
+            .iter()
+            .flat_map(|expr| match expr {
+                ColumnarValue::Scalar(s) => match s {
+                    ScalarValue::$SCALAR_VALUE(Some(v)) => Some(*v),
+                    ScalarValue::$SCALAR_VALUE(None) => {
+                        contains_null = true;
+                        None
+                    }
+                    ScalarValue::Utf8(None) => {
+                        contains_null = true;
+                        None
+                    }
+                    datatype => unimplemented!("Unexpected type {} for InList", datatype),
+                },
+                ColumnarValue::Array(_) => {
+                    unimplemented!("InList does not yet support nested columns.")
+                }
+            })
+            .collect::<Vec<_>>();
+
+        if $NEGATED {
+            if contains_null {
+                Ok(ColumnarValue::Array(Arc::new(
+                    array
+                        .iter()
+                        .map(|x| match x.map(|v| !values.contains(&v)) {
+                            Some(true) => None,
+                            x => x,
+                        })
+                        .collect::<BooleanArray>(),
+                )))
+            } else {
+                Ok(ColumnarValue::Array(Arc::new(
+                    values_not_in_list_primitive(array, &values)?,
+                )))
+            }
+        } else {
+            if contains_null {
+                Ok(ColumnarValue::Array(Arc::new(
+                    array
+                        .iter()
+                        .map(|x| match x.map(|v| values.contains(&v)) {
+                            Some(false) => None,
+                            x => x,
+                        })
+                        .collect::<BooleanArray>(),
+                )))
+            } else {
+                Ok(ColumnarValue::Array(Arc::new(values_in_list_primitive(
+                    array, &values,
+                )?)))
+            }
+        }
+    }};
+}
+
+fn values_in_list_primitive<T: ArrowPrimitiveType>(

Review comment:
       I suggest adding some comments here noting these functions assume that there are no null values in the column

##########
File path: datafusion/src/physical_plan/expressions/in_list.rs
##########
@@ -270,9 +447,10 @@ impl PhysicalExpr for InListExpr {
             DataType::LargeUtf8 => {
                 self.compare_utf8::<i64>(array, list_values, self.negated)
             }
-            datatype => {
-                unimplemented!("InList does not support datatype {:?}.", datatype)
-            }
+            datatype => Result::Err(DataFusionError::NotImplemented(format!(
+                "InList does not support datatype {:?}.",

Review comment:
       👨‍🍳 👌 

##########
File path: datafusion/src/physical_plan/expressions/in_list.rs
##########
@@ -164,33 +287,39 @@ impl InListExpr {
             })
             .collect::<Vec<&str>>();
 
-        Ok(ColumnarValue::Array(Arc::new(
-            array
-                .iter()
-                .map(|x| {
-                    let contains = x.map(|x| values.contains(&x));
-                    match contains {
-                        Some(true) => {
-                            if negated {
-                                Some(false)
-                            } else {
-                                Some(true)
-                            }
-                        }
-                        Some(false) => {
-                            if contains_null {
-                                None
-                            } else if negated {
-                                Some(true)
-                            } else {
-                                Some(false)
-                            }
-                        }
-                        None => None,
-                    }
-                })
-                .collect::<BooleanArray>(),
-        )))
+        if negated {

Review comment:
       The same comment about finding NULL by checking for the first match applies to the code right above this as well




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org