You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2016/08/05 19:56:00 UTC

spark git commit: [SPARK-16826][SQL] Switch to java.net.URI for parse_url()

Repository: spark
Updated Branches:
  refs/heads/master 39a2b2ea7 -> 2460f03ff


[SPARK-16826][SQL] Switch to java.net.URI for parse_url()

## What changes were proposed in this pull request?
The java.net.URL class has a globally synchronized Hashtable, which limits the throughput of any single executor doing lots of calls to parse_url(). Tests have shown that a 36-core machine can only get to 10% CPU use because the threads are locked most of the time.

This patch switches to java.net.URI which has less features than java.net.URL but focuses on URI parsing, which is enough for parse_url().

New tests were added to make sure a few common edge cases didn't change behaviour.
https://issues.apache.org/jira/browse/SPARK-16826

## How was this patch tested?
I've kept the old URL code commented for now, so that people can verify that the new unit tests do pass with java.net.URL.

Thanks to srowen for the help!

Author: Sylvain Zimmer <sy...@sylvainzimmer.com>

Closes #14488 from sylvinus/master.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2460f03f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2460f03f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2460f03f

Branch: refs/heads/master
Commit: 2460f03ffe94154b73995e4f16dd799d1a0f56b8
Parents: 39a2b2e
Author: Sylvain Zimmer <sy...@sylvainzimmer.com>
Authored: Fri Aug 5 20:55:58 2016 +0100
Committer: Sean Owen <so...@cloudera.com>
Committed: Fri Aug 5 20:55:58 2016 +0100

----------------------------------------------------------------------
 .../expressions/stringExpressions.scala         | 47 ++++++++++++++------
 .../apache/spark/sql/StringFunctionsSuite.scala | 40 ++++++++++++++---
 2 files changed, 68 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/2460f03f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index fc13845..a8c23a8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import java.net.{MalformedURLException, URL}
+import java.net.{URI, URISyntaxException}
 import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols}
 import java.util.{HashMap, Locale, Map => JMap}
 import java.util.regex.Pattern
@@ -749,25 +749,44 @@ case class ParseUrl(children: Seq[Expression])
     Pattern.compile(REGEXPREFIX + key.toString + REGEXSUBFIX)
   }
 
-  private def getUrl(url: UTF8String): URL = {
+  private def getUrl(url: UTF8String): URI = {
     try {
-      new URL(url.toString)
+      new URI(url.toString)
     } catch {
-      case e: MalformedURLException => null
+      case e: URISyntaxException => null
     }
   }
 
-  private def getExtractPartFunc(partToExtract: UTF8String): URL => String = {
+  private def getExtractPartFunc(partToExtract: UTF8String): URI => String = {
+
+    // partToExtract match {
+    //   case HOST => _.toURL().getHost
+    //   case PATH => _.toURL().getPath
+    //   case QUERY => _.toURL().getQuery
+    //   case REF => _.toURL().getRef
+    //   case PROTOCOL => _.toURL().getProtocol
+    //   case FILE => _.toURL().getFile
+    //   case AUTHORITY => _.toURL().getAuthority
+    //   case USERINFO => _.toURL().getUserInfo
+    //   case _ => (url: URI) => null
+    // }
+
     partToExtract match {
       case HOST => _.getHost
-      case PATH => _.getPath
-      case QUERY => _.getQuery
-      case REF => _.getRef
-      case PROTOCOL => _.getProtocol
-      case FILE => _.getFile
-      case AUTHORITY => _.getAuthority
-      case USERINFO => _.getUserInfo
-      case _ => (url: URL) => null
+      case PATH => _.getRawPath
+      case QUERY => _.getRawQuery
+      case REF => _.getRawFragment
+      case PROTOCOL => _.getScheme
+      case FILE =>
+        (url: URI) =>
+          if (url.getRawQuery ne null) {
+            url.getRawPath + "?" + url.getRawQuery
+          } else {
+            url.getRawPath
+          }
+      case AUTHORITY => _.getRawAuthority
+      case USERINFO => _.getRawUserInfo
+      case _ => (url: URI) => null
     }
   }
 
@@ -780,7 +799,7 @@ case class ParseUrl(children: Seq[Expression])
     }
   }
 
-  private def extractFromUrl(url: URL, partToExtract: UTF8String): UTF8String = {
+  private def extractFromUrl(url: URI, partToExtract: UTF8String): UTF8String = {
     if (cachedExtractPartFunc ne null) {
       UTF8String.fromString(cachedExtractPartFunc.apply(url))
     } else {

http://git-wip-us.apache.org/repos/asf/spark/blob/2460f03f/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 524926e..57ca5d9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -229,18 +229,48 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext {
   }
 
   test("string parse_url function") {
-    val df = Seq[String](("http://userinfo@spark.apache.org/path?query=1#Ref"))
-      .toDF("url")
 
-    checkAnswer(
-      df.selectExpr(
+    def testUrl(url: String, expected: Row) {
+      checkAnswer(Seq[String]((url)).toDF("url").selectExpr(
         "parse_url(url, 'HOST')", "parse_url(url, 'PATH')",
         "parse_url(url, 'QUERY')", "parse_url(url, 'REF')",
         "parse_url(url, 'PROTOCOL')", "parse_url(url, 'FILE')",
         "parse_url(url, 'AUTHORITY')", "parse_url(url, 'USERINFO')",
-        "parse_url(url, 'QUERY', 'query')"),
+        "parse_url(url, 'QUERY', 'query')"), expected)
+    }
+
+    testUrl(
+      "http://userinfo@spark.apache.org/path?query=1#Ref",
       Row("spark.apache.org", "/path", "query=1", "Ref",
         "http", "/path?query=1", "userinfo@spark.apache.org", "userinfo", "1"))
+
+    testUrl(
+      "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two",
+      Row("example.com", "/dir%20/pa%20th.HTML", "query=x%20y&q2=2", "Ref%20two",
+        "https", "/dir%20/pa%20th.HTML?query=x%20y&q2=2", "use%20r:pas%20s@example.com",
+        "use%20r:pas%20s", "x%20y"))
+
+    testUrl(
+      "http://user:pass@host",
+      Row("host", "", null, null, "http", "", "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "http://user:pass@host/",
+      Row("host", "/", null, null, "http", "/", "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "http://user:pass@host/?#",
+      Row("host", "/", "", "", "http", "/?", "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "http://user:pass@host/file;param?query;p2",
+      Row("host", "/file;param", "query;p2", null, "http", "/file;param?query;p2",
+        "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "inva lid://user:pass@host/file;param?query;p2",
+      Row(null, null, null, null, null, null, null, null, null))
+
   }
 
   test("string repeat function") {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org