You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/05 20:10:50 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #2156: Add an InSet as an optimized version for IN_LIST

alamb commented on code in PR #2156:
URL: https://github.com/apache/arrow-datafusion/pull/2156#discussion_r843210109


##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -32,13 +33,15 @@ use arrow::{
     record_batch::RecordBatch,
 };
 
-use crate::PhysicalExpr;
+use crate::{expressions, PhysicalExpr};
 use arrow::array::*;
 use arrow::buffer::{Buffer, MutableBuffer};
 use datafusion_common::ScalarValue;
 use datafusion_common::{DataFusionError, Result};
 use datafusion_expr::ColumnarValue;
 
+static OPTIMIZER_INSET_THRESHOLD: usize = 400;

Review Comment:
   ```suggestion
   /// Size at which to use a Set rather than Vec for `IN` / `NOT IN`
   /// Value chosen to be consistent with Spark
   static OPTIMIZER_INSET_THRESHOLD: usize = 400;
   ```



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -318,7 +404,13 @@ impl InListExpr {
 impl std::fmt::Display for InListExpr {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
         if self.negated {
-            write!(f, "{} NOT IN ({:?})", self.expr, self.list)
+            if self.set.is_some() {
+                write!(f, "Use In_Set{} NOT IN ({:?})", self.expr, self.list)
+            } else {
+                write!(f, "{} NOT IN ({:?})", self.expr, self.list)
+            }
+        } else if self.set.is_some() {
+            write!(f, "Use In_Set{} IN ({:?} use In_Set)", self.expr, self.list)

Review Comment:
   I wonder if this would be clearer
   
   ```suggestion
               if self.set.is_some() {
                   write!(f, "{} NOT IN (SET) ({:?})", self.expr, self.list)
               } else {
                   write!(f, "{} NOT IN ({:?})", self.expr, self.list)
               }
           } else if self.set.is_some() {
               write!(f, "Use {} IN (SET) ({:?})", self.expr, self.list)
   ```
   
   



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -69,6 +72,23 @@ pub struct InListExpr {
     expr: Arc<dyn PhysicalExpr>,
     list: Vec<Arc<dyn PhysicalExpr>>,
     negated: bool,
+    set: Option<InSet>,
+}
+
+/// InSet
+#[derive(Debug)]
+pub struct InSet {
+    set: HashSet<ScalarValue>,

Review Comment:
   Filed https://github.com/apache/arrow-datafusion/issues/2165



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -32,13 +33,15 @@ use arrow::{
     record_batch::RecordBatch,
 };
 
-use crate::PhysicalExpr;
+use crate::{expressions, PhysicalExpr};
 use arrow::array::*;
 use arrow::buffer::{Buffer, MutableBuffer};
 use datafusion_common::ScalarValue;
 use datafusion_common::{DataFusionError, Result};
 use datafusion_expr::ColumnarValue;
 
+static OPTIMIZER_INSET_THRESHOLD: usize = 400;

Review Comment:
   @Ted-Jiang  do you  happen to  have a link we could point to (as in I don't know how to verify this claim, I am just copying it from the PR comments)?



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -342,119 +434,202 @@ impl PhysicalExpr for InListExpr {
     fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
         let value = self.expr.evaluate(batch)?;
         let value_data_type = value.data_type();
-        let list_values = self
-            .list
-            .iter()
-            .map(|expr| expr.evaluate(batch))
-            .collect::<Result<Vec<_>>>()?;
-
-        let array = match value {
-            ColumnarValue::Array(array) => array,
-            ColumnarValue::Scalar(scalar) => scalar.to_array(),
-        };
 
-        match value_data_type {
-            DataType::Float32 => {
-                make_contains_primitive!(
-                    array,
-                    list_values,
-                    self.negated,
-                    Float32,
-                    Float32Array
-                )
-            }
-            DataType::Float64 => {
-                make_contains_primitive!(
-                    array,
-                    list_values,
-                    self.negated,
-                    Float64,
-                    Float64Array
-                )
-            }
-            DataType::Int16 => {
-                make_contains_primitive!(
-                    array,
-                    list_values,
-                    self.negated,
-                    Int16,
-                    Int16Array
-                )
-            }
-            DataType::Int32 => {
-                make_contains_primitive!(
-                    array,
-                    list_values,
-                    self.negated,
-                    Int32,
-                    Int32Array
-                )
-            }
-            DataType::Int64 => {
-                make_contains_primitive!(
-                    array,
-                    list_values,
-                    self.negated,
-                    Int64,
-                    Int64Array
-                )
-            }
-            DataType::Int8 => {
-                make_contains_primitive!(
-                    array,
-                    list_values,
-                    self.negated,
-                    Int8,
-                    Int8Array
-                )
-            }
-            DataType::UInt16 => {
-                make_contains_primitive!(
-                    array,
-                    list_values,
-                    self.negated,
-                    UInt16,
-                    UInt16Array
-                )
-            }
-            DataType::UInt32 => {
-                make_contains_primitive!(
-                    array,
-                    list_values,
-                    self.negated,
-                    UInt32,
-                    UInt32Array
-                )
-            }
-            DataType::UInt64 => {
-                make_contains_primitive!(
-                    array,
-                    list_values,
-                    self.negated,
-                    UInt64,
-                    UInt64Array
-                )
-            }
-            DataType::UInt8 => {
-                make_contains_primitive!(
-                    array,
-                    list_values,
-                    self.negated,
-                    UInt8,
-                    UInt8Array
-                )
-            }
-            DataType::Boolean => {
-                make_contains!(array, list_values, self.negated, Boolean, BooleanArray)
-            }
-            DataType::Utf8 => self.compare_utf8::<i32>(array, list_values, self.negated),
-            DataType::LargeUtf8 => {
-                self.compare_utf8::<i64>(array, list_values, self.negated)
+        if let Some(in_set) = &self.set {
+            let array = match value {
+                ColumnarValue::Array(array) => array,
+                ColumnarValue::Scalar(scalar) => scalar.to_array(),

Review Comment:
   This is unfortunate -- turning a scalar into an array just to convert it back to a scalar if using InList
   
   I wonder if we can pass the columnar_value to `set_contains_with_negated` and only do the conversion when using the Vec (not the Set)?
   
   This probably doesn't really make any sort of performance difference for any case we care about, I just noticed it and thought I would mention it)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org