You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficserver.apache.org by bc...@apache.org on 2020/02/05 02:03:48 UTC

[trafficserver] branch 8.0.x updated: Fixed how we handle uknown schemes

This is an automated email from the ASF dual-hosted git repository.

bcall pushed a commit to branch 8.0.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git


The following commit(s) were added to refs/heads/8.0.x by this push:
     new 6affb92  Fixed how we handle uknown schemes
6affb92 is described below

commit 6affb92d8c85896e8b60abb9f684af8109280a0a
Author: Bryan Call <bc...@apache.org>
AuthorDate: Thu Jan 30 10:42:00 2020 -0800

    Fixed how we handle uknown schemes
    
    (cherry picked from commit c20a369fa93b22b492e6aed8bee1da5fa4af1cab)
---
 proxy/hdrs/HdrTest.cc | 101 +++++++++++++++++++++++++++++++++++++++-----------
 proxy/hdrs/URL.cc     |  49 +++++++++++++-----------
 2 files changed, 108 insertions(+), 42 deletions(-)

diff --git a/proxy/hdrs/HdrTest.cc b/proxy/hdrs/HdrTest.cc
index 49107ca..8046e69 100644
--- a/proxy/hdrs/HdrTest.cc
+++ b/proxy/hdrs/HdrTest.cc
@@ -343,44 +343,100 @@ HdrTest::test_url()
     // Start with an easy one...
     "http://trafficserver.apache.org/index.html",
 
-    // "cheese://bogosity",         This fails, but it's not clear it should work...
-
-    "some.place", "some.place/", "http://some.place", "http://some.place/", "http://some.place/path",
-    "http://some.place/path;params", "http://some.place/path;params?query", "http://some.place/path;params?query#fragment",
-    "http://some.place/path?query#fragment", "http://some.place/path#fragment",
+    "cheese://bogosity",
+
+    "some.place",
+    "some.place/",
+    "http://some.place",
+    "http://some.place/",
+    "http://some.place/path",
+    "http://some.place/path;params",
+    "http://some.place/path;params?query",
+    "http://some.place/path;params?query#fragment",
+    "http://some.place/path?query#fragment",
+    "http://some.place/path#fragment",
 
-    "some.place:80", "some.place:80/", "http://some.place:80", "http://some.place:80/",
+    "some.place:80",
+    "some.place:80/",
+    "http://some.place:80",
+    "http://some.place:80/",
 
-    "foo@some.place:80", "foo@some.place:80/", "http://foo@some.place:80", "http://foo@some.place:80/",
+    "foo@some.place:80",
+    "foo@some.place:80/",
+    "http://foo@some.place:80",
+    "http://foo@some.place:80/",
 
-    "foo:bar@some.place:80", "foo:bar@some.place:80/", "http://foo:bar@some.place:80", "http://foo:bar@some.place:80/",
+    "foo:bar@some.place:80",
+    "foo:bar@some.place:80/",
+    "http://foo:bar@some.place:80",
+    "http://foo:bar@some.place:80/",
 
     // Some address stuff
-    "http://172.16.28.101", "http://172.16.28.101:8080", "http://[::]", "http://[::1]", "http://[fc01:172:16:28::101]",
-    "http://[fc01:172:16:28::101]:80", "http://[fc01:172:16:28:BAAD:BEEF:DEAD:101]",
-    "http://[fc01:172:16:28:BAAD:BEEF:DEAD:101]:8080", "http://172.16.28.101/some/path", "http://172.16.28.101:8080/some/path",
-    "http://[::1]/some/path", "http://[fc01:172:16:28::101]/some/path", "http://[fc01:172:16:28::101]:80/some/path",
-    "http://[fc01:172:16:28:BAAD:BEEF:DEAD:101]/some/path", "http://[fc01:172:16:28:BAAD:BEEF:DEAD:101]:8080/some/path",
-    "http://172.16.28.101/", "http://[fc01:172:16:28:BAAD:BEEF:DEAD:101]:8080/",
-
-    "foo:bar@some.place", "foo:bar@some.place/", "http://foo:bar@some.place", "http://foo:bar@some.place/",
-    "http://foo:bar@[::1]:8080/", "http://foo@[::1]",
-
-    "mms://sm02.tsqa.example.com/0102rally.asf", "pnm://foo:bar@some.place:80/path;params?query#fragment",
-    "rtsp://foo:bar@some.place:80/path;params?query#fragment", "rtspu://foo:bar@some.place:80/path;params?query#fragment",
+    "http://172.16.28.101",
+    "http://172.16.28.101:8080",
+    "http://[::]",
+    "http://[::1]",
+    "http://[fc01:172:16:28::101]",
+    "http://[fc01:172:16:28::101]:80",
+    "http://[fc01:172:16:28:BAAD:BEEF:DEAD:101]",
+    "http://[fc01:172:16:28:BAAD:BEEF:DEAD:101]:8080",
+    "http://172.16.28.101/some/path",
+    "http://172.16.28.101:8080/some/path",
+    "http://[::1]/some/path",
+    "http://[fc01:172:16:28::101]/some/path",
+    "http://[fc01:172:16:28::101]:80/some/path",
+    "http://[fc01:172:16:28:BAAD:BEEF:DEAD:101]/some/path",
+    "http://[fc01:172:16:28:BAAD:BEEF:DEAD:101]:8080/some/path",
+    "http://172.16.28.101/",
+    "http://[fc01:172:16:28:BAAD:BEEF:DEAD:101]:8080/",
+
+    // "foo:@some.place", TODO - foo:@some.place is change to foo@some.place in the test
+    "foo:bar@some.place",
+    "foo:bar@some.place/",
+    "http://foo:bar@some.place",
+    "http://foo:bar@some.place/",
+    "http://foo:bar@[::1]:8080/",
+    "http://foo@[::1]",
+
+    "mms://sm02.tsqa.example.com/0102rally.asf",
+    "pnm://foo:bar@some.place:80/path;params?query#fragment",
+    "rtsp://foo:bar@some.place:80/path;params?query#fragment",
+    "rtspu://foo:bar@some.place:80/path;params?query#fragment",
     "/finance/external/cbsm/*http://cbs.marketwatch.com/archive/19990713/news/current/net.htx?source=blq/yhoo&dist=yhoo",
-    "http://a.b.com/xx.jpg?newpath=http://bob.dave.com"};
+    "http://a.b.com/xx.jpg?newpath=http://bob.dave.com",
+
+    "ht-tp://a.b.com",
+    "ht+tp://a.b.com",
+    "ht.tp://a.b.com",
+
+    "h1ttp://a.b.com",
+    "http1://a.b.com",
+  };
 
   static const char *bad[] = {
     "http://[1:2:3:4:5:6:7:8:9]",
     "http://1:2:3:4:5:6:7:8:A:B",
     "http://bob.com[::1]",
     "http://[::1].com",
+
     "http://foo:bar:baz@bob.com/",
     "http://foo:bar:baz@[::1]:8080/",
+
     "http://]",
     "http://:",
+
     "http:/",
+    "http:/foo.bar.com/",
+    "~http://invalid.char.in.scheme/foo",
+    "http~://invalid.char.in.scheme/foo",
+    "ht~tp://invalid.char.in.scheme/foo",
+    "1http://first.char.not.alpha",
+    "some.domain.com/http://invalid.domain/foo",
+    ":",
+    "://",
+
+    // maybe this should be a valid URL
+    "a.b.com/xx.jpg?newpath=http://bob.dave.com",
   };
 
   int err, failed;
@@ -400,6 +456,7 @@ HdrTest::test_url()
     url.create(nullptr);
     err = url.parse(&start, end);
     if (err < 0) {
+      printf("Failed to parse url '%s'\n", start);
       failed = 1;
       break;
     }
@@ -448,6 +505,8 @@ HdrTest::test_url()
       failed = 1;
       printf("Successfully parsed invalid url '%s'", x);
       break;
+    } else {
+      printf("   bad URL - PARSE FAILED: '%s'\n", bad[i]);
     }
   }
 
diff --git a/proxy/hdrs/URL.cc b/proxy/hdrs/URL.cc
index 8f94329..d38a1f5 100644
--- a/proxy/hdrs/URL.cc
+++ b/proxy/hdrs/URL.cc
@@ -1120,34 +1120,41 @@ url_parse_scheme(HdrHeap *heap, URLImpl *url, const char **start, const char *en
   const char *scheme_end   = nullptr;
   int scheme_wks_idx;
 
+  // Skip over spaces
   while (' ' == *cur && ++cur < end) {
-    ;
   }
+
   if (cur < end) {
     scheme_start = scheme_end = cur;
-    // special case 'http:' for performance
-    if ((end - cur >= 5) && (((cur[0] ^ 'h') | (cur[1] ^ 't') | (cur[2] ^ 't') | (cur[3] ^ 'p') | (cur[4] ^ ':')) == 0)) {
-      scheme_end = cur + 4; // point to colon
-      url_scheme_set(heap, url, scheme_start, URL_WKSIDX_HTTP, 4, copy_strings_p);
-    } else if ('/' != *cur) {
-      // For forward transparent mode, the URL for the method can just be a path,
-      // so don't scan that for a scheme, as we could find a false positive if there
-      // is a URL in the parameters (which is legal).
+
+    // If the URL is more complex then a path, parse to see if there is a scheme
+    if ('/' != *cur) {
+      // Search for a : it could be part of a scheme or a username:password
       while (':' != *cur && ++cur < end) {
-        ;
       }
-      if (cur < end) { // found a colon
-        scheme_wks_idx = hdrtoken_tokenize(scheme_start, cur - scheme_start, &scheme_wks);
-
-        /*  Distinguish between a scheme only and a username by looking past the colon. If it is missing
-            or it's a slash, presume scheme. Otherwise it's a username with a password.
-        */
-        if ((scheme_wks_idx > 0 && hdrtoken_wks_to_token_type(scheme_wks) == HDRTOKEN_TYPE_SCHEME) || // known scheme
-            (cur >= end - 1 || cur[1] == '/')) // no more data or slash past colon
-        {
-          scheme_end = cur;
-          url_scheme_set(heap, url, scheme_start, scheme_wks_idx, scheme_end - scheme_start, copy_strings_p);
+
+      // If there is a :// then there is a scheme
+      if (cur + 2 < end && cur[1] == '/' && cur[2] == '/') { // found "://"
+        scheme_end     = cur;
+        scheme_wks_idx = hdrtoken_tokenize(scheme_start, scheme_end - scheme_start, &scheme_wks);
+
+        if (!(scheme_wks_idx > 0 && hdrtoken_wks_to_token_type(scheme_wks) == HDRTOKEN_TYPE_SCHEME)) {
+          // Unknown scheme, validate the scheme
+
+          // RFC 3986 Section 3.1
+          // These are the valid characters in a scheme:
+          //   scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+          // return an error if there is another character in the scheme
+          if (!ParseRules::is_alpha(*scheme_start)) {
+            return PARSE_RESULT_ERROR;
+          }
+          for (cur = scheme_start + 1; cur < scheme_end; ++cur) {
+            if (!(ParseRules::is_alnum(*cur) != 0 || *cur == '+' || *cur == '-' || *cur == '.')) {
+              return PARSE_RESULT_ERROR;
+            }
+          }
         }
+        url_scheme_set(heap, url, scheme_start, scheme_wks_idx, scheme_end - scheme_start, copy_strings_p);
       }
     }
     *start = scheme_end;