You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2019/05/02 14:20:56 UTC
[spark] branch master updated: [SPARK-27607][SQL] Improve Row.toString performance
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 7a8cc8e [SPARK-27607][SQL] Improve Row.toString performance
7a8cc8e is described below
commit 7a8cc8e071b7a0b1aace9dd3750dc89f0e4ad191
Author: Marco Gaido <ma...@gmail.com>
AuthorDate: Thu May 2 07:20:33 2019 -0700
[SPARK-27607][SQL] Improve Row.toString performance
## What changes were proposed in this pull request?
`Row.toString` currently creates an unnecessary intermediate `Array` containing all the values in the row before generating the row's string representation. This adds considerable overhead.
This PR avoids that intermediate collection by appending each value directly to a `StringBuilder`, which gives a faster implementation.
## How was this patch tested?
Ran the following micro-benchmark before and after the change:
```scala
test("Row toString perf test") {
val n = 100000
val rows = (1 to n).map { i =>
Row(i, i.toDouble, i.toString, i.toShort, true, null)
}
// warmup
(1 to 10).foreach { _ => rows.foreach(_.toString) }
val times = (1 to 100).map { _ =>
val t0 = System.nanoTime()
rows.foreach(_.toString)
val t1 = System.nanoTime()
t1 - t0
}
// scalastyle:off println
println(s"Avg time on ${times.length} iterations for $n toString:" +
s" ${times.sum.toDouble / times.length / 1e6} ms")
// scalastyle:on println
}
```
Before the PR:
```
Avg time on 100 iterations for 100000 toString: 61.08408419 ms
```
After the PR:
```
Avg time on 100 iterations for 100000 toString: 38.16539432 ms
```
This means the new implementation is about 1.60X faster than the original one.
Closes #24505 from mgaido91/SPARK-27607.
Authored-by: Marco Gaido <ma...@gmail.com>
Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
.../src/main/scala/org/apache/spark/sql/Row.scala | 23 ++++++++++++++++++----
.../test/scala/org/apache/spark/sql/RowSuite.scala | 21 ++++++++++++++++++++
2 files changed, 40 insertions(+), 4 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
index f13edde..494387e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
@@ -372,7 +372,7 @@ trait Row extends Serializable {
}.toMap
}
- override def toString: String = s"[${this.mkString(",")}]"
+ override def toString: String = this.mkString("[", ",", "]")
/**
* Make a copy of the current [[Row]] object.
@@ -465,16 +465,31 @@ trait Row extends Serializable {
}
/** Displays all elements of this sequence in a string (without a separator). */
- def mkString: String = toSeq.mkString
+ def mkString: String = mkString("")
/** Displays all elements of this sequence in a string using a separator string. */
- def mkString(sep: String): String = toSeq.mkString(sep)
+ def mkString(sep: String): String = mkString("", sep, "")
/**
* Displays all elements of this traversable or iterator in a string using
* start, end, and separator strings.
*/
- def mkString(start: String, sep: String, end: String): String = toSeq.mkString(start, sep, end)
+ def mkString(start: String, sep: String, end: String): String = {
+ val n = length
+ val builder = new StringBuilder
+ builder.append(start)
+ if (n > 0) {
+ builder.append(get(0))
+ var i = 1
+ while (i < n) {
+ builder.append(sep)
+ builder.append(get(i))
+ i += 1
+ }
+ }
+ builder.append(end)
+ builder.toString()
+ }
/**
* Returns the value at position i.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
index 57b5f5e..c53fd5b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
@@ -84,4 +84,25 @@ class RowSuite extends SparkFunSuite with SharedSQLContext {
val r3 = Row("World")
assert(r3.hashCode() != r1.hashCode())
}
+
+ test("toString") {
+ val r1 = Row(2147483647, 21474.8364, (-5).toShort, "this is a string", true, null)
+ assert(r1.toString == "[2147483647,21474.8364,-5,this is a string,true,null]")
+ val r2 = Row(null, Int.MinValue, Double.NaN, Short.MaxValue, "", false)
+ assert(r2.toString == "[null,-2147483648,NaN,32767,,false]")
+ val tsString = "2019-05-01 17:30:12.0"
+ val dtString = "2019-05-01"
+ val r3 = Row(
+ r1,
+ Seq(1, 2, 3),
+ Map(1 -> "a", 2 -> "b"),
+ java.sql.Timestamp.valueOf(tsString),
+ java.sql.Date.valueOf(dtString),
+ BigDecimal("1234567890.1234567890"),
+ (-1).toByte)
+ assert(r3.toString == "[[2147483647,21474.8364,-5,this is a string,true,null],List(1, 2, 3)," +
+ s"Map(1 -> a, 2 -> b),$tsString,$dtString,1234567890.1234567890,-1]")
+ val empty = Row()
+ assert(empty.toString == "[]")
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org