You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2015/07/21 03:21:09 UTC
spark git commit: [SPARK-9156][SQL] codegen StringSplit
Repository: spark
Updated Branches:
refs/heads/master 047ccc8c9 -> 6853ac7c8
[SPARK-9156][SQL] codegen StringSplit
Jira: https://issues.apache.org/jira/browse/SPARK-9156
Author: Tarek Auel <ta...@googlemail.com>
Closes #7547 from tarekauel/SPARK-9156 and squashes the following commits:
0be2700 [Tarek Auel] [SPARK-9156][SQL] indention fix
b860eaf [Tarek Auel] [SPARK-9156][SQL] codegen StringSplit
5ad6a1f [Tarek Auel] [SPARK-9156] codegen StringSplit
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6853ac7c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6853ac7c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6853ac7c
Branch: refs/heads/master
Commit: 6853ac7c8c76003160fc861ddcc8e8e39e4a5924
Parents: 047ccc8
Author: Tarek Auel <ta...@googlemail.com>
Authored: Mon Jul 20 18:21:05 2015 -0700
Committer: Reynold Xin <rx...@databricks.com>
Committed: Mon Jul 20 18:21:05 2015 -0700
----------------------------------------------------------------------
.../sql/catalyst/expressions/stringOperations.scala | 12 ++++++++----
.../java/org/apache/spark/unsafe/types/UTF8String.java | 9 +++++++++
.../org/apache/spark/unsafe/types/UTF8StringSuite.java | 11 +++++++++++
3 files changed, 28 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/6853ac7c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index a568242..5c1908d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -615,7 +615,7 @@ case class StringSpace(child: Expression)
* Splits str around pat (pattern is a regular expression).
*/
case class StringSplit(str: Expression, pattern: Expression)
- extends BinaryExpression with ImplicitCastInputTypes with CodegenFallback {
+ extends BinaryExpression with ImplicitCastInputTypes {
override def left: Expression = str
override def right: Expression = pattern
@@ -623,9 +623,13 @@ case class StringSplit(str: Expression, pattern: Expression)
override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
override def nullSafeEval(string: Any, regex: Any): Any = {
- val splits =
- string.asInstanceOf[UTF8String].toString.split(regex.asInstanceOf[UTF8String].toString, -1)
- splits.toSeq.map(UTF8String.fromString)
+ // Interpreted path: delegate to UTF8String.split with limit -1 and expose the array as a Seq.
+ string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1).toSeq
+ }
+
+ // Codegen path: emits Java source that calls split(pattern, -1) on the evaluated string
+ // operand, wraps the resulting array with java.util.Arrays.asList, and converts it to a
+ // Scala buffer via JavaConversions.asScalaBuffer before assigning it to ev.primitive.
+ override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+ nullSafeCodeGen(ctx, ev, (str, pattern) =>
+ s"""${ev.primitive} = scala.collection.JavaConversions.asScalaBuffer(
+ java.util.Arrays.asList($str.split($pattern, -1)));""")
}
override def prettyName: String = "split"
http://git-wip-us.apache.org/repos/asf/spark/blob/6853ac7c/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
----------------------------------------------------------------------
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index fc63fe5..ed354f7 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -487,6 +487,15 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
return fromBytes(result);
}
+ /**
+ * Splits this string around matches of the given regular expression.
+ *
+ * Both this string and the pattern are converted to java.lang.String and the work is
+ * delegated to String.split(regex, limit); each resulting piece is converted back.
+ *
+ * @param pattern the regular expression to split around
+ * @param limit the result threshold, with the same meaning as in String.split
+ * @return the pieces of this string, in order, as UTF8String instances
+ */
+ public UTF8String[] split(UTF8String pattern, int limit) {
+ final String[] pieces = toString().split(pattern.toString(), limit);
+ final UTF8String[] converted = new UTF8String[pieces.length];
+ int next = 0;
+ for (String piece : pieces) {
+ converted[next++] = fromString(piece);
+ }
+ return converted;
+ }
+
@Override
public String toString() {
try {
http://git-wip-us.apache.org/repos/asf/spark/blob/6853ac7c/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
----------------------------------------------------------------------
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index d730b1d..1f5572c 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -18,6 +18,7 @@
package org.apache.spark.unsafe.types;
import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
import org.junit.Test;
@@ -270,6 +271,16 @@ public class UTF8StringSuite {
fromString("数据砖头孙行者孙行者孙行"),
fromString("数据砖头").rpad(12, fromString("孙行者")));
}
+
+ /** Checks UTF8String.split for a negative limit, a positive limit, and trailing delimiters. */
+ @Test
+ public void split() {
+ // Negative limit: the pattern is applied as many times as possible.
+ assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), -1),
+ new UTF8String[]{fromString("ab"), fromString("def"), fromString("ghi")}));
+ // Positive limit: at most `limit` pieces; the last piece holds the unsplit remainder.
+ assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), 2),
+ new UTF8String[]{fromString("ab"), fromString("def,ghi")}));
+ // Negative limit with a trailing delimiter: the trailing empty string must be kept.
+ assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), -1),
+ new UTF8String[]{fromString("ab"), fromString("def"), fromString("ghi"),
+ fromString("")}));
+ }
@Test
public void levenshteinDistance() {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org