You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2019/05/02 14:20:56 UTC

[spark] branch master updated: [SPARK-27607][SQL] Improve Row.toString performance

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 7a8cc8e  [SPARK-27607][SQL] Improve Row.toString performance
7a8cc8e is described below

commit 7a8cc8e071b7a0b1aace9dd3750dc89f0e4ad191
Author: Marco Gaido <ma...@gmail.com>
AuthorDate: Thu May 2 07:20:33 2019 -0700

    [SPARK-27607][SQL] Improve Row.toString performance
    
    ## What changes were proposed in this pull request?
    
    `Row.toString` currently creates a useless intermediate `Array` containing all the values in the row before building its string representation. This operation adds considerable overhead.
    
    The PR proposes to avoid this operation in order to get a faster implementation.
    
    ## How was this patch tested?
    
    Run
    
    ```scala
    test("Row toString perf test") {
        val n = 100000
        val rows = (1 to n).map { i =>
          Row(i, i.toDouble, i.toString, i.toShort, true, null)
        }
        // warmup
        (1 to 10).foreach { _ => rows.foreach(_.toString) }
    
        val times = (1 to 100).map { _ =>
          val t0 = System.nanoTime()
          rows.foreach(_.toString)
          val t1 = System.nanoTime()
          t1 - t0
        }
        // scalastyle:off println
        println(s"Avg time on ${times.length} iterations for $n toString:" +
          s" ${times.sum.toDouble / times.length / 1e6} ms")
        // scalastyle:on println
      }
    ```
    Before the PR:
    ```
    Avg time on 100 iterations for 100000 toString: 61.08408419 ms
    ```
    After the PR:
    ```
    Avg time on 100 iterations for 100000 toString: 38.16539432 ms
    ```
    This means the new implementation is about 1.60X faster than the original one.
    
    Closes #24505 from mgaido91/SPARK-27607.
    
    Authored-by: Marco Gaido <ma...@gmail.com>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 .../src/main/scala/org/apache/spark/sql/Row.scala  | 23 ++++++++++++++++++----
 .../test/scala/org/apache/spark/sql/RowSuite.scala | 21 ++++++++++++++++++++
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
index f13edde..494387e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
@@ -372,7 +372,7 @@ trait Row extends Serializable {
     }.toMap
   }
 
-  override def toString: String = s"[${this.mkString(",")}]"
+  override def toString: String = this.mkString("[", ",", "]")
 
   /**
    * Make a copy of the current [[Row]] object.
@@ -465,16 +465,31 @@ trait Row extends Serializable {
   }
 
   /** Displays all elements of this sequence in a string (without a separator). */
-  def mkString: String = toSeq.mkString
+  def mkString: String = mkString("")
 
   /** Displays all elements of this sequence in a string using a separator string. */
-  def mkString(sep: String): String = toSeq.mkString(sep)
+  def mkString(sep: String): String = mkString("", sep, "")
 
   /**
    * Displays all elements of this traversable or iterator in a string using
    * start, end, and separator strings.
    */
-  def mkString(start: String, sep: String, end: String): String = toSeq.mkString(start, sep, end)
+  def mkString(start: String, sep: String, end: String): String = {
+    val n = length
+    val builder = new StringBuilder
+    builder.append(start)
+    if (n > 0) {
+      builder.append(get(0))
+      var i = 1
+      while (i < n) {
+        builder.append(sep)
+        builder.append(get(i))
+        i += 1
+      }
+    }
+    builder.append(end)
+    builder.toString()
+  }
 
   /**
    * Returns the value at position i.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
index 57b5f5e..c53fd5b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
@@ -84,4 +84,25 @@ class RowSuite extends SparkFunSuite with SharedSQLContext {
     val r3 = Row("World")
     assert(r3.hashCode() != r1.hashCode())
   }
+
+  test("toString") {
+    val r1 = Row(2147483647, 21474.8364, (-5).toShort, "this is a string", true, null)
+    assert(r1.toString == "[2147483647,21474.8364,-5,this is a string,true,null]")
+    val r2 = Row(null, Int.MinValue, Double.NaN, Short.MaxValue, "", false)
+    assert(r2.toString == "[null,-2147483648,NaN,32767,,false]")
+    val tsString = "2019-05-01 17:30:12.0"
+    val dtString = "2019-05-01"
+    val r3 = Row(
+      r1,
+      Seq(1, 2, 3),
+      Map(1 -> "a", 2 -> "b"),
+      java.sql.Timestamp.valueOf(tsString),
+      java.sql.Date.valueOf(dtString),
+      BigDecimal("1234567890.1234567890"),
+      (-1).toByte)
+    assert(r3.toString == "[[2147483647,21474.8364,-5,this is a string,true,null],List(1, 2, 3)," +
+      s"Map(1 -> a, 2 -> b),$tsString,$dtString,1234567890.1234567890,-1]")
+    val empty = Row()
+    assert(empty.toString == "[]")
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org