You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ho...@apache.org on 2021/09/25 22:32:51 UTC
[arrow-datafusion] branch master updated: Avoid stack overflow by
reducing stack usage of `BinaryExpr::evaluate` in debug builds (#1047)
This is an automated email from the ASF dual-hosted git repository.
houqp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new 26399ed Avoid stack overflow by reducing stack usage of `BinaryExpr::evaluate` in debug builds (#1047)
26399ed is described below
commit 26399ed2405dcacd505f4af73c5ecf608cb6aa9d
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Sat Sep 25 18:32:13 2021 -0400
Avoid stack overflow by reducing stack usage of `BinaryExpr::evaluate` in debug builds (#1047)
* Test for stack overflow
* Remove STACK SIZE workaround
* Move out a bit more into a different function
* move more
* Increase tree depth to 100 to test
---
.github/workflows/rust.yml | 2 -
datafusion/src/physical_plan/expressions/binary.rs | 224 ++++++++++++++-------
2 files changed, 146 insertions(+), 80 deletions(-)
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index d62b996..370b988 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -105,8 +105,6 @@ jobs:
run: |
export ARROW_TEST_DATA=$(pwd)/testing/data
export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data
- # run tests on all workspace members with default feature list + avro
- RUST_MIN_STACK=10485760 cargo test --features=avro
# test datafusion examples
cd datafusion-examples
cargo test --no-default-features
diff --git a/datafusion/src/physical_plan/expressions/binary.rs b/datafusion/src/physical_plan/expressions/binary.rs
index e77b25c..d58b2ed 100644
--- a/datafusion/src/physical_plan/expressions/binary.rs
+++ b/datafusion/src/physical_plan/expressions/binary.rs
@@ -543,86 +543,17 @@ impl PhysicalExpr for BinaryExpr {
)));
}
+ // Attempt to use special kernels if one input is scalar and the other is an array
let scalar_result = match (&left_value, &right_value) {
(ColumnarValue::Array(array), ColumnarValue::Scalar(scalar)) => {
// if left is array and right is literal - use scalar operations
- match &self.op {
- Operator::Lt => binary_array_op_scalar!(array, scalar.clone(), lt),
- Operator::LtEq => {
- binary_array_op_scalar!(array, scalar.clone(), lt_eq)
- }
- Operator::Gt => binary_array_op_scalar!(array, scalar.clone(), gt),
- Operator::GtEq => {
- binary_array_op_scalar!(array, scalar.clone(), gt_eq)
- }
- Operator::Eq => binary_array_op_scalar!(array, scalar.clone(), eq),
- Operator::NotEq => {
- binary_array_op_scalar!(array, scalar.clone(), neq)
- }
- Operator::Like => {
- binary_string_array_op_scalar!(array, scalar.clone(), like)
- }
- Operator::NotLike => {
- binary_string_array_op_scalar!(array, scalar.clone(), nlike)
- }
- Operator::Divide => {
- binary_primitive_array_op_scalar!(array, scalar.clone(), divide)
- }
- Operator::Modulo => {
- binary_primitive_array_op_scalar!(array, scalar.clone(), modulus)
- }
- Operator::RegexMatch => binary_string_array_flag_op_scalar!(
- array,
- scalar.clone(),
- regexp_is_match,
- false,
- false
- ),
- Operator::RegexIMatch => binary_string_array_flag_op_scalar!(
- array,
- scalar.clone(),
- regexp_is_match,
- false,
- true
- ),
- Operator::RegexNotMatch => binary_string_array_flag_op_scalar!(
- array,
- scalar.clone(),
- regexp_is_match,
- true,
- false
- ),
- Operator::RegexNotIMatch => binary_string_array_flag_op_scalar!(
- array,
- scalar.clone(),
- regexp_is_match,
- true,
- true
- ),
- // if scalar operation is not supported - fallback to array implementation
- _ => None,
- }
+ self.evaluate_array_scalar(array, scalar)?
}
(ColumnarValue::Scalar(scalar), ColumnarValue::Array(array)) => {
// if right is literal and left is array - reverse operator and parameters
- match &self.op {
- Operator::Lt => binary_array_op_scalar!(array, scalar.clone(), gt),
- Operator::LtEq => {
- binary_array_op_scalar!(array, scalar.clone(), gt_eq)
- }
- Operator::Gt => binary_array_op_scalar!(array, scalar.clone(), lt),
- Operator::GtEq => {
- binary_array_op_scalar!(array, scalar.clone(), lt_eq)
- }
- Operator::Eq => binary_array_op_scalar!(array, scalar.clone(), eq),
- Operator::NotEq => {
- binary_array_op_scalar!(array, scalar.clone(), neq)
- }
- // if scalar operation is not supported - fallback to array implementation
- _ => None,
- }
+ self.evaluate_scalar_array(scalar, array)?
}
- (_, _) => None,
+ (_, _) => None, // default to array implementation
};
if let Some(result) = scalar_result {
@@ -634,8 +565,113 @@ impl PhysicalExpr for BinaryExpr {
left_value.into_array(batch.num_rows()),
right_value.into_array(batch.num_rows()),
);
+ self.evaluate_with_resolved_args(left, &left_data_type, right, &right_data_type)
+ .map(|a| ColumnarValue::Array(a))
+ }
+}
+
+impl BinaryExpr {
+ /// Evaluate the expression of the left input is an array and
+ /// right is literal - use scalar operations
+ fn evaluate_array_scalar(
+ &self,
+ array: &ArrayRef,
+ scalar: &ScalarValue,
+ ) -> Result<Option<Result<ArrayRef>>> {
+ let scalar_result = match &self.op {
+ Operator::Lt => binary_array_op_scalar!(array, scalar.clone(), lt),
+ Operator::LtEq => {
+ binary_array_op_scalar!(array, scalar.clone(), lt_eq)
+ }
+ Operator::Gt => binary_array_op_scalar!(array, scalar.clone(), gt),
+ Operator::GtEq => {
+ binary_array_op_scalar!(array, scalar.clone(), gt_eq)
+ }
+ Operator::Eq => binary_array_op_scalar!(array, scalar.clone(), eq),
+ Operator::NotEq => {
+ binary_array_op_scalar!(array, scalar.clone(), neq)
+ }
+ Operator::Like => {
+ binary_string_array_op_scalar!(array, scalar.clone(), like)
+ }
+ Operator::NotLike => {
+ binary_string_array_op_scalar!(array, scalar.clone(), nlike)
+ }
+ Operator::Divide => {
+ binary_primitive_array_op_scalar!(array, scalar.clone(), divide)
+ }
+ Operator::Modulo => {
+ binary_primitive_array_op_scalar!(array, scalar.clone(), modulus)
+ }
+ Operator::RegexMatch => binary_string_array_flag_op_scalar!(
+ array,
+ scalar.clone(),
+ regexp_is_match,
+ false,
+ false
+ ),
+ Operator::RegexIMatch => binary_string_array_flag_op_scalar!(
+ array,
+ scalar.clone(),
+ regexp_is_match,
+ false,
+ true
+ ),
+ Operator::RegexNotMatch => binary_string_array_flag_op_scalar!(
+ array,
+ scalar.clone(),
+ regexp_is_match,
+ true,
+ false
+ ),
+ Operator::RegexNotIMatch => binary_string_array_flag_op_scalar!(
+ array,
+ scalar.clone(),
+ regexp_is_match,
+ true,
+ true
+ ),
+ // if scalar operation is not supported - fallback to array implementation
+ _ => None,
+ };
- let result: Result<ArrayRef> = match &self.op {
+ Ok(scalar_result)
+ }
+
+ /// Evaluate the expression if the left input is a literal and the
+ /// right is an array - reverse operator and parameters
+ fn evaluate_scalar_array(
+ &self,
+ scalar: &ScalarValue,
+ array: &ArrayRef,
+ ) -> Result<Option<Result<ArrayRef>>> {
+ let scalar_result = match &self.op {
+ Operator::Lt => binary_array_op_scalar!(array, scalar.clone(), gt),
+ Operator::LtEq => {
+ binary_array_op_scalar!(array, scalar.clone(), gt_eq)
+ }
+ Operator::Gt => binary_array_op_scalar!(array, scalar.clone(), lt),
+ Operator::GtEq => {
+ binary_array_op_scalar!(array, scalar.clone(), lt_eq)
+ }
+ Operator::Eq => binary_array_op_scalar!(array, scalar.clone(), eq),
+ Operator::NotEq => {
+ binary_array_op_scalar!(array, scalar.clone(), neq)
+ }
+ // if scalar operation is not supported - fallback to array implementation
+ _ => None,
+ };
+ Ok(scalar_result)
+ }
+
+ fn evaluate_with_resolved_args(
+ &self,
+ left: Arc<dyn Array>,
+ left_data_type: &DataType,
+ right: Arc<dyn Array>,
+ right_data_type: &DataType,
+ ) -> Result<ArrayRef> {
+ match &self.op {
Operator::Like => binary_string_array_op!(left, right, like),
Operator::NotLike => binary_string_array_op!(left, right, nlike),
Operator::Lt => binary_array_op!(left, right, lt),
@@ -650,7 +686,7 @@ impl PhysicalExpr for BinaryExpr {
Operator::Divide => binary_primitive_array_op!(left, right, divide),
Operator::Modulo => binary_primitive_array_op!(left, right, modulus),
Operator::And => {
- if left_data_type == DataType::Boolean {
+ if left_data_type == &DataType::Boolean {
boolean_op!(left, right, and_kleene)
} else {
return Err(DataFusionError::Internal(format!(
@@ -662,7 +698,7 @@ impl PhysicalExpr for BinaryExpr {
}
}
Operator::Or => {
- if left_data_type == DataType::Boolean {
+ if left_data_type == &DataType::Boolean {
boolean_op!(left, right, or_kleene)
} else {
return Err(DataFusionError::Internal(format!(
@@ -683,8 +719,7 @@ impl PhysicalExpr for BinaryExpr {
Operator::RegexNotIMatch => {
binary_string_array_flag_op!(left, right, regexp_is_match, true, true)
}
- };
- result.map(|a| ColumnarValue::Array(a))
+ }
}
}
@@ -1381,4 +1416,37 @@ mod tests {
))
}
}
+
+ #[test]
+ fn relatively_deeply_nested() {
+ // Reproducer for https://github.com/apache/arrow-datafusion/issues/419
+
+ // where even relatively shallow binary expressions overflowed
+ // the stack in debug builds
+
+ let input: Vec<_> = vec![1, 2, 3, 4, 5].into_iter().map(Some).collect();
+ let a: Int32Array = input.iter().collect();
+
+ let batch = RecordBatch::try_from_iter(vec![("a", Arc::new(a) as _)]).unwrap();
+ let schema = batch.schema();
+
+ // build a left deep tree ((((a + a) + a) + a ....
+ let tree_depth: i32 = 100;
+ let expr = (0..tree_depth)
+ .into_iter()
+ .map(|_| col("a", schema.as_ref()).unwrap())
+ .reduce(|l, r| binary_simple(l, Operator::Plus, r))
+ .unwrap();
+
+ let result = expr
+ .evaluate(&batch)
+ .expect("evaluation")
+ .into_array(batch.num_rows());
+
+ let expected: Int32Array = input
+ .into_iter()
+ .map(|i| i.map(|i| i * tree_depth))
+ .collect();
+ assert_eq!(result.as_ref(), &expected);
+ }
}