You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by hv...@apache.org on 2016/11/08 11:02:02 UTC
spark git commit: [SPARK-18207][SQL] Fix a compilation error due to
HashExpression.doGenCode
Repository: spark
Updated Branches:
refs/heads/master 6f3697136 -> 47731e186
[SPARK-18207][SQL] Fix a compilation error due to HashExpression.doGenCode
## What changes were proposed in this pull request?
This PR avoids a compilation error due to more than 64KB Java byte code size. This error occurs because the generated Java code for computing a hash value for a row is too big. This PR fixes the compilation error by splitting the big code chunk into multiple methods, calling `CodegenContext.splitExpressions` in `HashExpression.doGenCode`.
The test case requires computing a hash code for a row that includes 1000 String fields. `HashExpression.doGenCode` generates a lot of Java code for this computation in a single function. As a result, the size of the corresponding Java bytecode is more than 64 KB.
Generated code without this PR
````java
/* 027 */ public UnsafeRow apply(InternalRow i) {
/* 028 */ boolean isNull = false;
/* 029 */
/* 030 */ int value1 = 42;
/* 031 */
/* 032 */ boolean isNull2 = i.isNullAt(0);
/* 033 */ UTF8String value2 = isNull2 ? null : (i.getUTF8String(0));
/* 034 */ if (!isNull2) {
/* 035 */ value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value2.getBaseObject(), value2.getBaseOffset(), value2.numBytes(), value1);
/* 036 */ }
/* 037 */
/* 038 */
/* 039 */ boolean isNull3 = i.isNullAt(1);
/* 040 */ UTF8String value3 = isNull3 ? null : (i.getUTF8String(1));
/* 041 */ if (!isNull3) {
/* 042 */ value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value3.getBaseObject(), value3.getBaseOffset(), value3.numBytes(), value1);
/* 043 */ }
/* 044 */
/* 045 */
...
/* 7024 */
/* 7025 */ boolean isNull1001 = i.isNullAt(999);
/* 7026 */ UTF8String value1001 = isNull1001 ? null : (i.getUTF8String(999));
/* 7027 */ if (!isNull1001) {
/* 7028 */ value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value1001.getBaseObject(), value1001.getBaseOffset(), value1001.numBytes(), value1);
/* 7029 */ }
/* 7030 */
/* 7031 */
/* 7032 */ boolean isNull1002 = i.isNullAt(1000);
/* 7033 */ UTF8String value1002 = isNull1002 ? null : (i.getUTF8String(1000));
/* 7034 */ if (!isNull1002) {
/* 7035 */ value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value1002.getBaseObject(), value1002.getBaseOffset(), value1002.numBytes(), value1);
/* 7036 */ }
````
Generated code with this PR
````java
/* 3807 */ private void apply_249(InternalRow i) {
/* 3808 */
/* 3809 */ boolean isNull998 = i.isNullAt(996);
/* 3810 */ UTF8String value998 = isNull998 ? null : (i.getUTF8String(996));
/* 3811 */ if (!isNull998) {
/* 3812 */ value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value998.getBaseObject(), value998.getBaseOffset(), value998.numBytes(), value1);
/* 3813 */ }
/* 3814 */
/* 3815 */ boolean isNull999 = i.isNullAt(997);
/* 3816 */ UTF8String value999 = isNull999 ? null : (i.getUTF8String(997));
/* 3817 */ if (!isNull999) {
/* 3818 */ value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value999.getBaseObject(), value999.getBaseOffset(), value999.numBytes(), value1);
/* 3819 */ }
/* 3820 */
/* 3821 */ boolean isNull1000 = i.isNullAt(998);
/* 3822 */ UTF8String value1000 = isNull1000 ? null : (i.getUTF8String(998));
/* 3823 */ if (!isNull1000) {
/* 3824 */ value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value1000.getBaseObject(), value1000.getBaseOffset(), value1000.numBytes(), value1);
/* 3825 */ }
/* 3826 */
/* 3827 */ boolean isNull1001 = i.isNullAt(999);
/* 3828 */ UTF8String value1001 = isNull1001 ? null : (i.getUTF8String(999));
/* 3829 */ if (!isNull1001) {
/* 3830 */ value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value1001.getBaseObject(), value1001.getBaseOffset(), value1001.numBytes(), value1);
/* 3831 */ }
/* 3832 */
/* 3833 */ }
/* 3834 */
...
/* 4532 */ private void apply_0(InternalRow i) {
/* 4533 */
/* 4534 */ boolean isNull2 = i.isNullAt(0);
/* 4535 */ UTF8String value2 = isNull2 ? null : (i.getUTF8String(0));
/* 4536 */ if (!isNull2) {
/* 4537 */ value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value2.getBaseObject(), value2.getBaseOffset(), value2.numBytes(), value1);
/* 4538 */ }
/* 4539 */
/* 4540 */ boolean isNull3 = i.isNullAt(1);
/* 4541 */ UTF8String value3 = isNull3 ? null : (i.getUTF8String(1));
/* 4542 */ if (!isNull3) {
/* 4543 */ value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value3.getBaseObject(), value3.getBaseOffset(), value3.numBytes(), value1);
/* 4544 */ }
/* 4545 */
/* 4546 */ boolean isNull4 = i.isNullAt(2);
/* 4547 */ UTF8String value4 = isNull4 ? null : (i.getUTF8String(2));
/* 4548 */ if (!isNull4) {
/* 4549 */ value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value4.getBaseObject(), value4.getBaseOffset(), value4.numBytes(), value1);
/* 4550 */ }
/* 4551 */
/* 4552 */ boolean isNull5 = i.isNullAt(3);
/* 4553 */ UTF8String value5 = isNull5 ? null : (i.getUTF8String(3));
/* 4554 */ if (!isNull5) {
/* 4555 */ value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value5.getBaseObject(), value5.getBaseOffset(), value5.numBytes(), value1);
/* 4556 */ }
/* 4557 */
/* 4558 */ }
...
/* 7344 */ public UnsafeRow apply(InternalRow i) {
/* 7345 */ boolean isNull = false;
/* 7346 */
/* 7347 */ value1 = 42;
/* 7348 */ apply_0(i);
/* 7349 */ apply_1(i);
...
/* 7596 */ apply_248(i);
/* 7597 */ apply_249(i);
/* 7598 */ apply_250(i);
/* 7599 */ apply_251(i);
...
````
## How was this patch tested?
Add a new test in `DataFrameSuite`
Author: Kazuaki Ishizaki <is...@jp.ibm.com>
Closes #15745 from kiszk/SPARK-18207.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/47731e18
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/47731e18
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/47731e18
Branch: refs/heads/master
Commit: 47731e1865fa1e3a8881a1f4420017bdc026e455
Parents: 6f36971
Author: Kazuaki Ishizaki <is...@jp.ibm.com>
Authored: Tue Nov 8 12:01:54 2016 +0100
Committer: Herman van Hovell <hv...@databricks.com>
Committed: Tue Nov 8 12:01:54 2016 +0100
----------------------------------------------------------------------
.../spark/sql/catalyst/expressions/hash.scala | 18 +++++++++-------
.../expressions/HashExpressionsSuite.scala | 22 ++++++++++++++++++++
2 files changed, 33 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/47731e18/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
index 415ef4e..e14f054 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
@@ -268,15 +268,16 @@ abstract class HashExpression[E] extends Expression {
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
ev.isNull = "false"
- val childrenHash = children.map { child =>
+ val childrenHash = ctx.splitExpressions(ctx.INPUT_ROW, children.map { child =>
val childGen = child.genCode(ctx)
childGen.code + ctx.nullSafeExec(child.nullable, childGen.isNull) {
computeHash(childGen.value, child.dataType, ev.value, ctx)
}
- }.mkString("\n")
+ })
+ ctx.addMutableState(ctx.javaType(dataType), ev.value, "")
ev.copy(code = s"""
- ${ctx.javaType(dataType)} ${ev.value} = $seed;
+ ${ev.value} = $seed;
$childrenHash""")
}
@@ -600,15 +601,18 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] {
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
ev.isNull = "false"
val childHash = ctx.freshName("childHash")
- val childrenHash = children.map { child =>
+ val childrenHash = ctx.splitExpressions(ctx.INPUT_ROW, children.map { child =>
val childGen = child.genCode(ctx)
childGen.code + ctx.nullSafeExec(child.nullable, childGen.isNull) {
computeHash(childGen.value, child.dataType, childHash, ctx)
- } + s"${ev.value} = (31 * ${ev.value}) + $childHash;"
- }.mkString(s"int $childHash = 0;", s"\n$childHash = 0;\n", "")
+ } + s"${ev.value} = (31 * ${ev.value}) + $childHash;" +
+ s"\n$childHash = 0;"
+ })
+ ctx.addMutableState(ctx.javaType(dataType), ev.value, "")
+ ctx.addMutableState("int", childHash, s"$childHash = 0;")
ev.copy(code = s"""
- ${ctx.javaType(dataType)} ${ev.value} = $seed;
+ ${ev.value} = $seed;
$childrenHash""")
}
http://git-wip-us.apache.org/repos/asf/spark/blob/47731e18/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala
index c714bc0..0326292 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala
@@ -24,7 +24,9 @@ import org.apache.commons.codec.digest.DigestUtils
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.{RandomDataGenerator, Row}
import org.apache.spark.sql.catalyst.encoders.{ExamplePointUDT, RowEncoder}
+import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
@@ -124,6 +126,26 @@ class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
new StructType().add("array", arrayOfString).add("map", mapOfString))
.add("structOfUDT", structOfUDT))
+ test("SPARK-18207: Compute hash for a lot of expressions") {
+ val N = 1000
+ val wideRow = new GenericInternalRow(
+ Seq.tabulate(N)(i => UTF8String.fromString(i.toString)).toArray[Any])
+ val schema = StructType((1 to N).map(i => StructField("", StringType)))
+
+ val exprs = schema.fields.zipWithIndex.map { case (f, i) =>
+ BoundReference(i, f.dataType, true)
+ }
+ val murmur3HashExpr = Murmur3Hash(exprs, 42)
+ val murmur3HashPlan = GenerateMutableProjection.generate(Seq(murmur3HashExpr))
+ val murmursHashEval = Murmur3Hash(exprs, 42).eval(wideRow)
+ assert(murmur3HashPlan(wideRow).getInt(0) == murmursHashEval)
+
+ val hiveHashExpr = HiveHash(exprs)
+ val hiveHashPlan = GenerateMutableProjection.generate(Seq(hiveHashExpr))
+ val hiveHashEval = HiveHash(exprs).eval(wideRow)
+ assert(hiveHashPlan(wideRow).getInt(0) == hiveHashEval)
+ }
+
private def testHash(inputSchema: StructType): Unit = {
val inputGenerator = RandomDataGenerator.forType(inputSchema, nullable = false).get
val encoder = RowEncoder(inputSchema)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org