Posted to issues@spark.apache.org by "Yuming Wang (Jira)" <ji...@apache.org> on 2022/05/21 14:23:00 UTC

[jira] [Updated] (SPARK-39248) Decimal divide much slower than multiply

     [ https://issues.apache.org/jira/browse/SPARK-39248?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Yuming Wang updated SPARK-39248:
--------------------------------
    Description: 
How to reproduce this issue:

{code:scala}
    import org.apache.spark.benchmark.Benchmark

    // Write a Parquet file with a single DECIMAL(9,2) column.
    val valuesPerIteration = 2880404L
    val dir = "/tmp/spark/benchmark"
    spark.range(valuesPerIteration).selectExpr("cast(id as DECIMAL(9,2)) as d").write.mode("Overwrite").parquet(dir)

    // Compare a decimal multiply against a decimal divide over the same data.
    // The no-op data source discards the output, so only the scan and the
    // filter expression are measured.
    val benchmark = new Benchmark("Benchmark decimal", valuesPerIteration, minNumIters = 5)
    benchmark.addCase("d * 2 > 0") { _ =>
      spark.read.parquet(dir).where("d * 2 > 0").write.format("noop").mode("Overwrite").save()
    }

    benchmark.addCase("d / 2 > 0") { _ =>
      spark.read.parquet(dir).where("d / 2 > 0").write.format("noop").mode("Overwrite").save()
    }
    benchmark.run()
{code}

{noformat}
Java HotSpot(TM) 64-Bit Server VM 1.8.0_281-b09 on Mac OS X 10.15.7
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Benchmark decimal:                        Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------------------------------
d * 2 > 0                                           435            558         151          6.6         150.9       1.0X
d / 2 > 0                                          5569           6208         734          0.5        1933.2       0.1X
{noformat}
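On the same data, the divide case is roughly 13x slower per row than the multiply case (1933.2 ns vs. 150.9 ns).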

Stack trace captured while the divide case runs:
{noformat}
java.math.MutableBigInteger.divideKnuth(MutableBigInteger.java:1203)
java.math.MutableBigInteger.divideKnuth(MutableBigInteger.java:1163)
java.math.BigInteger.divideAndRemainderKnuth(BigInteger.java:2235)
java.math.BigInteger.divideAndRemainder(BigInteger.java:2223)
java.math.BigDecimal.createAndStripZerosToMatchScale(BigDecimal.java:4404)
java.math.BigDecimal.divideAndRound(BigDecimal.java:4294)
java.math.BigDecimal.divide(BigDecimal.java:4660)
java.math.BigDecimal.divide(BigDecimal.java:1753)
org.apache.spark.sql.types.Decimal.$div(Decimal.scala:505)
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$anonfun$run$1(WriteToDataSourceV2Exec.scala:435)
org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$$$Lambda$2997/2025304705.apply(Unknown Source)
org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1538)
org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:480)
org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:381)
org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec$$Lambda$2987/1586195133.apply(Unknown Source)
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
org.apache.spark.scheduler.Task.run(Task.scala:139)
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
org.apache.spark.executor.Executor$TaskRunner$$Lambda$2921/365880128.apply(Unknown Source)
org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
java.lang.Thread.run(Thread.java:748)
{noformat}
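
The trace shows {{Decimal./}} ending in {{java.math.BigDecimal.divide}}, where {{createAndStripZerosToMatchScale}} strips trailing zeros through repeated {{BigInteger.divideAndRemainder}} (Knuth division) calls. The cost difference can be sketched without Spark; the scale of 38 and HALF_UP rounding below are assumptions based on the trace, and the timings are only illustrative:

{code:scala}
import java.math.{BigDecimal => JBigDecimal, RoundingMode}

// Rough stand-alone comparison of BigDecimal multiply vs. divide.
// Assumption: the divide path rounds HALF_UP at a large target scale,
// as the stack trace above suggests.
val iterations = 2880404
val d = new JBigDecimal("1234567.89") // a DECIMAL(9,2)-sized value
val two = new JBigDecimal(2)

def run(label: String)(op: => JBigDecimal): Unit = {
  val start = System.nanoTime()
  var i = 0
  var sink = 0 // keep each result live so the JIT cannot drop the loop body
  while (i < iterations) { sink += op.signum(); i += 1 }
  println(f"$label: ${(System.nanoTime() - start) / 1e6}%.1f ms (sink=$sink)")
}

run("multiply")(d.multiply(two))
run("divide at scale 38")(d.divide(two, 38, RoundingMode.HALF_UP))
{code}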



  was: (the same description as above, without the stack trace)

> Decimal divide much slower than multiply
> ----------------------------------------
>
>                 Key: SPARK-39248
>                 URL: https://issues.apache.org/jira/browse/SPARK-39248
>             Project: Spark
>          Issue Type: Improvement
>          Components: SQL
>    Affects Versions: 3.4.0
>            Reporter: Yuming Wang
>            Priority: Major
>
> (full description as above)