You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2018/04/12 14:21:40 UTC
spark git commit: [SPARK-23762][SQL] UTF8StringBuffer uses MemoryBlock
Repository: spark
Updated Branches:
refs/heads/master 6a2289ecf -> 0b19122d4
[SPARK-23762][SQL] UTF8StringBuffer uses MemoryBlock
## What changes were proposed in this pull request?
This PR uses `MemoryBlock` in `UTF8StringBuilder`. In general, there are two advantages to using `MemoryBlock`:
1. It provides clean API calls rather than using a Java array or `Platform` memory calls.
2. It improves the runtime performance of memory access compared with using `Object`.
## How was this patch tested?
Added `UTF8StringBuilderSuite`.
Author: Kazuaki Ishizaki <is...@jp.ibm.com>
Closes #20871 from kiszk/SPARK-23762.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b19122d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b19122d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b19122d
Branch: refs/heads/master
Commit: 0b19122d434e39eb117ccc3174a0688c9c874d48
Parents: 6a2289e
Author: Kazuaki Ishizaki <is...@jp.ibm.com>
Authored: Thu Apr 12 22:21:30 2018 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Thu Apr 12 22:21:30 2018 +0800
----------------------------------------------------------------------
.../expressions/codegen/UTF8StringBuilder.java | 35 +++++++---------
.../codegen/UTF8StringBuilderSuite.scala | 42 ++++++++++++++++++++
2 files changed, 56 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/0b19122d/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java
index f0f66ba..f8000d7 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java
@@ -19,6 +19,8 @@ package org.apache.spark.sql.catalyst.expressions.codegen;
import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.array.ByteArrayMethods;
+import org.apache.spark.unsafe.memory.ByteArrayMemoryBlock;
+import org.apache.spark.unsafe.memory.MemoryBlock;
import org.apache.spark.unsafe.types.UTF8String;
/**
@@ -29,43 +31,34 @@ public class UTF8StringBuilder {
private static final int ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH;
- private byte[] buffer;
- private int cursor = Platform.BYTE_ARRAY_OFFSET;
+ private ByteArrayMemoryBlock buffer;
+ private int length = 0;
public UTF8StringBuilder() {
// Since initial buffer size is 16 in `StringBuilder`, we set the same size here
- this.buffer = new byte[16];
+ this.buffer = new ByteArrayMemoryBlock(16);
}
// Grows the buffer by at least `neededSize`
private void grow(int neededSize) {
- if (neededSize > ARRAY_MAX - totalSize()) {
+ if (neededSize > ARRAY_MAX - length) {
throw new UnsupportedOperationException(
"Cannot grow internal buffer by size " + neededSize + " because the size after growing " +
"exceeds size limitation " + ARRAY_MAX);
}
- final int length = totalSize() + neededSize;
- if (buffer.length < length) {
- int newLength = length < ARRAY_MAX / 2 ? length * 2 : ARRAY_MAX;
- final byte[] tmp = new byte[newLength];
- Platform.copyMemory(
- buffer,
- Platform.BYTE_ARRAY_OFFSET,
- tmp,
- Platform.BYTE_ARRAY_OFFSET,
- totalSize());
+ final int requestedSize = length + neededSize;
+ if (buffer.size() < requestedSize) {
+ int newLength = requestedSize < ARRAY_MAX / 2 ? requestedSize * 2 : ARRAY_MAX;
+ final ByteArrayMemoryBlock tmp = new ByteArrayMemoryBlock(newLength);
+ MemoryBlock.copyMemory(buffer, tmp, length);
buffer = tmp;
}
}
- private int totalSize() {
- return cursor - Platform.BYTE_ARRAY_OFFSET;
- }
-
public void append(UTF8String value) {
grow(value.numBytes());
- value.writeToMemory(buffer, cursor);
- cursor += value.numBytes();
+ value.writeToMemory(buffer.getByteArray(), length + Platform.BYTE_ARRAY_OFFSET);
+ length += value.numBytes();
}
public void append(String value) {
@@ -73,6 +66,6 @@ public class UTF8StringBuilder {
}
public UTF8String build() {
- return UTF8String.fromBytes(buffer, 0, totalSize());
+ return UTF8String.fromBytes(buffer.getByteArray(), 0, length);
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/0b19122d/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilderSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilderSuite.scala
new file mode 100644
index 0000000..1b25a4b
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilderSuite.scala
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.codegen
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.unsafe.types.UTF8String
+
+class UTF8StringBuilderSuite extends SparkFunSuite {
+
+ test("basic test") {
+ val sb = new UTF8StringBuilder()
+ assert(sb.build() === UTF8String.EMPTY_UTF8)
+
+ sb.append("")
+ assert(sb.build() === UTF8String.EMPTY_UTF8)
+
+ sb.append("abcd")
+ assert(sb.build() === UTF8String.fromString("abcd"))
+
+ sb.append(UTF8String.fromString("1234"))
+ assert(sb.build() === UTF8String.fromString("abcd1234"))
+
+ // expect to grow an internal buffer
+ sb.append(UTF8String.fromString("efgijk567890"))
+ assert(sb.build() === UTF8String.fromString("abcd1234efgijk567890"))
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org