You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/12/13 13:32:22 UTC

nutch git commit: NUTCH-2337 urlnormalizer-basic to strip empty port - make sure that URLs which contain anything other than the host in the authority (incl. empty port) are marked as changed

Repository: nutch
Updated Branches:
  refs/heads/2.x 0ea78907d -> 6e3c34db1


NUTCH-2337 urlnormalizer-basic to strip empty port
- Make sure that URLs which contain anything other than the host
  in the authority (incl. an empty port) are marked as changed


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/6e3c34db
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/6e3c34db
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/6e3c34db

Branch: refs/heads/2.x
Commit: 6e3c34db16e385b0dadbe6444c2685283c863350
Parents: 0ea7890
Author: Sebastian Nagel <sn...@apache.org>
Authored: Tue Dec 13 14:27:55 2016 +0100
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Tue Dec 13 14:29:31 2016 +0100

----------------------------------------------------------------------
 .../nutch/net/urlnormalizer/basic/BasicURLNormalizer.java       | 4 ++++
 .../nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java   | 5 ++++-
 2 files changed, 8 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/6e3c34db/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 3652d47..b648293 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -83,6 +83,10 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
         if (!host.equals(newHost)) {
           host = newHost;
           changed = true;
+        } else if (!url.getAuthority().equals(newHost)) {
+          // authority (http://<...>/) contains other elements (port, user,
+          // etc.) which will likely cause a change if dropped
+          changed = true;
         }
       }
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/6e3c34db/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 0974b49..006c1a3 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -52,6 +52,10 @@ public class TestBasicURLNormalizer {
     // check that port number is normalized
     normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html");
     normalizeTest("http://foo.com:81/", "http://foo.com:81/");
+    // check that empty port is removed
+    normalizeTest("http://example.com:/", "http://example.com/");
+    normalizeTest("https://example.com:/foobar.html",
+        "https://example.com/foobar.html");
 
     // check that null path is normalized
     normalizeTest("http://foo.com", "http://foo.com/");
@@ -63,7 +67,6 @@ public class TestBasicURLNormalizer {
     // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
 
     // check that unnecessary "../" are removed
-
     normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html");
     normalizeTest("http://foo.com/aa/../", "http://foo.com/");
     normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/");