You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2016/01/21 00:08:30 UTC

spark git commit: [SPARK-12888][SQL] benchmark the new hash expression

Repository: spark
Updated Branches:
  refs/heads/master 8f90c1518 -> f3934a8d6


[SPARK-12888][SQL] benchmark the new hash expression

Benchmark it on 4 different schemas. The results:
```
Intel(R) Core(TM) i7-4960HQ CPU  2.60GHz
Hash For simple:                   Avg Time(ms)    Avg Rate(M/s)  Relative Rate
-------------------------------------------------------------------------------
interpreted version                       31.47           266.54         1.00 X
codegen version                           64.52           130.01         0.49 X
```

```
Intel(R) Core(TM) i7-4960HQ CPU  2.60GHz
Hash For normal:                   Avg Time(ms)    Avg Rate(M/s)  Relative Rate
-------------------------------------------------------------------------------
interpreted version                     4068.11             0.26         1.00 X
codegen version                         1175.92             0.89         3.46 X
```

```
Intel(R) Core(TM) i7-4960HQ CPU  2.60GHz
Hash For array:                    Avg Time(ms)    Avg Rate(M/s)  Relative Rate
-------------------------------------------------------------------------------
interpreted version                     9276.70             0.06         1.00 X
codegen version                        14762.23             0.04         0.63 X
```

```
Intel(R) Core(TM) i7-4960HQ CPU  2.60GHz
Hash For map:                      Avg Time(ms)    Avg Rate(M/s)  Relative Rate
-------------------------------------------------------------------------------
interpreted version                    58869.79             0.01         1.00 X
codegen version                         9285.36             0.06         6.34 X
```

Author: Wenchen Fan <we...@databricks.com>

Closes #10816 from cloud-fan/hash-benchmark.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f3934a8d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f3934a8d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f3934a8d

Branch: refs/heads/master
Commit: f3934a8d656f1668bec065751b2a11411229b6f5
Parents: 8f90c15
Author: Wenchen Fan <we...@databricks.com>
Authored: Wed Jan 20 15:08:27 2016 -0800
Committer: Reynold Xin <rx...@databricks.com>
Committed: Wed Jan 20 15:08:27 2016 -0800

----------------------------------------------------------------------
 .../org/apache/spark/sql/HashBenchmark.scala    | 104 +++++++++++++++++++
 1 file changed, 104 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/f3934a8d/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala
new file mode 100644
index 0000000..184f845
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.catalyst.encoders.RowEncoder
+import org.apache.spark.sql.catalyst.expressions.{Murmur3Hash, UnsafeProjection}
+import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection
+import org.apache.spark.sql.types._
+import org.apache.spark.util.Benchmark
+
+/**
+ * Benchmark for the previous interpreted hash function (InternalRow.hashCode) vs the new codegen
+ * hash expression (Murmur3Hash); main() runs it on four schemas: simple, normal, array and map.
+ */
+object HashBenchmark {
+  /** Runs both hash implementations as Benchmark cases over rows generated for `schema`. */
+  def test(name: String, schema: StructType, iters: Int): Unit = {
+    val numRows = 1024 * 8  // rows hashed per pass (8192)
+
+    val generator = RandomDataGenerator.forType(schema, nullable = false).get
+    val encoder = RowEncoder(schema)
+    val attrs = schema.toAttributes
+    val safeProjection = GenerateSafeProjection.generate(attrs, attrs)
+    // Build all input rows up front, before any benchmark case is registered or run.
+    val rows = (1 to numRows).map(_ =>
+      // The output of encoder is UnsafeRow, use safeProjection to turn it into safe format.
+      safeProjection(encoder.toRow(generator().asInstanceOf[Row])).copy()
+    ).toArray
+    // iters * numRows is the number of hash calls one invocation of each case performs.
+    val benchmark = new Benchmark("Hash For " + name, iters * numRows)
+    benchmark.addCase("interpreted version") { _: Int =>
+      for (_ <- 0L until iters) {
+        var sum = 0  // never read; NOTE(review): presumably only a consumer for the hash
+        var i = 0
+        while (i < numRows) {
+          sum += rows(i).hashCode()
+          i += 1
+        }
+      }
+    }
+    // Projection with a single output column: Murmur3Hash built from all of attrs.
+    val getHashCode = UnsafeProjection.create(new Murmur3Hash(attrs) :: Nil, attrs)
+    benchmark.addCase("codegen version") { _: Int =>
+      for (_ <- 0L until iters) {
+        var sum = 0  // never read, same as in the interpreted case
+        var i = 0
+        while (i < numRows) {
+          sum += getHashCode(rows(i)).getInt(0)  // column 0 is the projected hash
+          i += 1
+        }
+      }
+    }
+    benchmark.run()
+  }
+
+  def main(args: Array[String]): Unit = {
+    val simple = new StructType().add("i", IntegerType)  // a single int column
+    test("simple", simple, 1024)
+
+    val normal = new StructType()  // one column for each type listed below
+      .add("null", NullType)
+      .add("boolean", BooleanType)
+      .add("byte", ByteType)
+      .add("short", ShortType)
+      .add("int", IntegerType)
+      .add("long", LongType)
+      .add("float", FloatType)
+      .add("double", DoubleType)
+      .add("bigDecimal", DecimalType.SYSTEM_DEFAULT)
+      .add("smallDecimal", DecimalType.USER_DEFAULT)
+      .add("string", StringType)
+      .add("binary", BinaryType)
+      .add("date", DateType)
+      .add("timestamp", TimestampType)
+    test("normal", normal, 128)
+
+    val arrayOfInt = ArrayType(IntegerType)
+    val array = new StructType()  // an int array plus an array of int arrays
+      .add("array", arrayOfInt)
+      .add("arrayOfArray", ArrayType(arrayOfInt))
+    test("array", array, 64)
+
+    val mapOfInt = MapType(IntegerType, IntegerType)
+    val map = new StructType()  // an int-to-int map plus a map from int to such maps
+      .add("map", mapOfInt)
+      .add("mapOfMap", MapType(IntegerType, mapOfInt))
+    test("map", map, 64)
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org