You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/12/13 13:32:22 UTC
nutch git commit: NUTCH-2337 urlnormalizer-basic to strip empty port
- make sure that URLs which contain anything other than the host in the
authority (incl. an empty port) are marked as changed
Repository: nutch
Updated Branches:
refs/heads/2.x 0ea78907d -> 6e3c34db1
NUTCH-2337 urlnormalizer-basic to strip empty port
- make sure that URLs which contain anything other than the host
in the authority (incl. an empty port) are marked as changed
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/6e3c34db
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/6e3c34db
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/6e3c34db
Branch: refs/heads/2.x
Commit: 6e3c34db16e385b0dadbe6444c2685283c863350
Parents: 0ea7890
Author: Sebastian Nagel <sn...@apache.org>
Authored: Tue Dec 13 14:27:55 2016 +0100
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Tue Dec 13 14:29:31 2016 +0100
----------------------------------------------------------------------
.../nutch/net/urlnormalizer/basic/BasicURLNormalizer.java | 4 ++++
.../nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java | 5 ++++-
2 files changed, 8 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/6e3c34db/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 3652d47..b648293 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -83,6 +83,10 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
if (!host.equals(newHost)) {
host = newHost;
changed = true;
+ } else if (!url.getAuthority().equals(newHost)) {
+ // authority (http://<...>/) contains other elements (port, user,
+ // etc.) which will likely cause a change if left out
+ changed = true;
}
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/6e3c34db/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 0974b49..006c1a3 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -52,6 +52,10 @@ public class TestBasicURLNormalizer {
// check that port number is normalized
normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html");
normalizeTest("http://foo.com:81/", "http://foo.com:81/");
+ // check that empty port is removed
+ normalizeTest("http://example.com:/", "http://example.com/");
+ normalizeTest("https://example.com:/foobar.html",
+ "https://example.com/foobar.html");
// check that null path is normalized
normalizeTest("http://foo.com", "http://foo.com/");
@@ -63,7 +67,6 @@ public class TestBasicURLNormalizer {
// normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
// check that unnecessary "../" are removed
-
normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html");
normalizeTest("http://foo.com/aa/../", "http://foo.com/");
normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/");