You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/02/01 11:07:30 UTC

nutch git commit: NUTCH-2349 urlnormalizer-basic: NPE for URLs without authority; check whether URL.getAuthority() returns null; recompose URLs that lack an authority using an empty authority/host

Repository: nutch
Updated Branches:
  refs/heads/2.x 022ed5c03 -> 700857d16


NUTCH-2349 urlnormalizer-basic: NPE for URLs without authority
- check whether URL.getAuthority() returns null
- recompose URLs that lack an authority using an empty authority/host


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/700857d1
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/700857d1
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/700857d1

Branch: refs/heads/2.x
Commit: 700857d16c9e1517ddb9868ed41171d91e5c9116
Parents: 022ed5c
Author: Sebastian Nagel <sn...@apache.org>
Authored: Wed Feb 1 11:51:04 2017 +0100
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Wed Feb 1 11:51:04 2017 +0100

----------------------------------------------------------------------
 .../nutch/net/urlnormalizer/basic/BasicURLNormalizer.java      | 5 ++++-
 .../nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java  | 6 ++++++
 2 files changed, 10 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/700857d1/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index e17b19a..15a1de0 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -79,7 +79,7 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
     if ("http".equals(protocol) || "https".equals(protocol)
         || "ftp".equals(protocol)) {
 
-      if (host != null) {
+      if (host != null && url.getAuthority() != null) {
         String newHost = host.toLowerCase(Locale.ROOT); // lowercase host
         if (!host.equals(newHost)) {
           host = newHost;
@@ -89,6 +89,9 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
           // etc.) which will likely cause a change if left away
           changed = true;
         }
+      } else {
+        // no host or authority: recompose the URL from components
+        changed = true;
       }
 
       if (port == url.getDefaultPort()) { // uses default port

http://git-wip-us.apache.org/repos/asf/nutch/blob/700857d1/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 006c1a3..1d5d99e 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -100,6 +100,12 @@ public class TestBasicURLNormalizer {
         "http://foo.com/aa/bb/foo.html");
     normalizeTest("http://foo.com/aa?referer=http://bar.com",
         "http://foo.com/aa?referer=http://bar.com");
+    // check for NPEs when normalizing URLs without host (authority)
+    normalizeTest("file:///foo/bar.txt", "file:///foo/bar.txt");
+    normalizeTest("ftp:/", "ftp:/");
+    normalizeTest("http:", "http:/");
+    normalizeTest("http:////", "http:/");
+    normalizeTest("http:///////", "http:/");
   }
 
   private void normalizeTest(String weird, String normal) throws Exception {