Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2019/03/03 19:40:32 UTC

[GitHub] [spark] kiszk commented on a change in pull request #23171: [SPARK-26205][SQL] Optimize InSet Expression for bytes, shorts, ints, dates

kiszk commented on a change in pull request #23171: [SPARK-26205][SQL] Optimize InSet Expression for bytes, shorts, ints, dates
URL: https://github.com/apache/spark/pull/23171#discussion_r261882937

 ##########
 File path: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
 ##########
 @@ -241,6 +242,52 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
     }
   }
 
+  test("SPARK-26205: Optimize InSet for bytes, shorts, ints, dates using switch statements") {
+    val byteValues = Set[Any](1.toByte, 2.toByte, Byte.MinValue, Byte.MaxValue)
+    val shortValues = Set[Any](-10.toShort, 20.toShort, Short.MinValue, Short.MaxValue)
+    val intValues = Set[Any](20, -100, 30, Int.MinValue, Int.MaxValue)
+    val dateValues = Set[Any](
+      CatalystTypeConverters.convertToCatalyst(Date.valueOf("2017-01-01")),
+      CatalystTypeConverters.convertToCatalyst(Date.valueOf("1950-01-02")))
+
+    def check(presentValue: Expression, absentValue: Expression, values: Set[Any]): Unit = {
+      require(presentValue.dataType == absentValue.dataType)
+
+      val nullLiteral = Literal(null, presentValue.dataType)
+
+      checkEvaluation(InSet(nullLiteral, values), expected = null)
+      checkEvaluation(InSet(nullLiteral, values + null), expected = null)
+      checkEvaluation(InSet(presentValue, values), expected = true)
+      checkEvaluation(InSet(presentValue, values + null), expected = true)
+      checkEvaluation(InSet(absentValue, values), expected = false)
+      checkEvaluation(InSet(absentValue, values + null), expected = null)
+    }
+
+    def checkAllTypes(): Unit = {
+      check(presentValue = Literal(2.toByte), absentValue = Literal(3.toByte), byteValues)
+      check(presentValue = Literal(Byte.MinValue), absentValue = Literal(5.toByte), byteValues)
+      check(presentValue = Literal(20.toShort), absentValue = Literal(-14.toShort), shortValues)
+      check(presentValue = Literal(Short.MaxValue), absentValue = Literal(30.toShort), shortValues)
+      check(presentValue = Literal(20), absentValue = Literal(-14), intValues)
+      check(presentValue = Literal(Int.MinValue), absentValue = Literal(2), intValues)
+      check(
+        presentValue = Literal(Date.valueOf("2017-01-01")),
+        absentValue = Literal(Date.valueOf("2017-01-02")),
+        dateValues)
+      check(
+        presentValue = Literal(Date.valueOf("1950-01-02")),
+        absentValue = Literal(Date.valueOf("2017-10-02")),
+        dateValues)
+    }
+
+    withSQLConf(SQLConf.OPTIMIZER_INSET_SWITCH_THRESHOLD.key -> "0") {
+      checkAllTypes()
+    }
+    withSQLConf(SQLConf.OPTIMIZER_INSET_SWITCH_THRESHOLD.key -> "20") {
+      checkAllTypes()
+    }
+  }
 
 Review comment:
   My question addresses what you are talking about [here](https://github.com/apache/spark/pull/23171/files#r261253221). The current implementation accepts a large int value (e.g. `Integer.MAX_VALUE`) for `spark.sql.optimizer.inSetSwitchThreshold`. Thus, I am afraid the generated switch code could require more than 64KB of Java bytecode, exceeding the JVM's per-method limit.
   If the option had an appropriate upper limit, it would be fine.
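   To make the concern concrete, here is a rough sketch (not code from this PR, and the 100,000 figure is only illustrative): with the threshold raised toward `Integer.MAX_VALUE`, a large enough IN list is lowered to a single switch, and the case labels alone can push the generated method past the JVM's 64KB-per-method bytecode limit, at which point Janino fails to compile it.

   ```scala
   // Rough sketch, assuming the InSet switch codegen and the
   // spark.sql.optimizer.inSetSwitchThreshold config added in this PR.
   import org.apache.spark.sql.catalyst.expressions.{InSet, Literal}

   // If the threshold is set to Integer.MAX_VALUE, an IN list like this
   // would be compiled into one switch with ~100,000 case labels. Each
   // label costs bytecode, and a single Java method is capped at 64KB,
   // so expression codegen compilation can fail on such an expression.
   val hugeSet: Set[Any] = (0 until 100000).map(i => i: Any).toSet
   val expr = InSet(Literal(1), hugeSet)
   ```

   Capping the option at a modest value (and falling back to the existing hash-set lookup above the cap) would avoid that failure mode.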
