You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/02/01 11:07:30 UTC
nutch git commit: NUTCH-2349 urlnormalizer-basic: NPE for URLs without authority
- check whether URL.getAuthority() returns null
- recompose URLs without authority with empty authority/host
Repository: nutch
Updated Branches:
refs/heads/2.x 022ed5c03 -> 700857d16
NUTCH-2349 urlnormalizer-basic: NPE for URLs without authority
- check whether URL.getAuthority() returns null
- recompose URLs without authority with empty authority/host
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/700857d1
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/700857d1
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/700857d1
Branch: refs/heads/2.x
Commit: 700857d16c9e1517ddb9868ed41171d91e5c9116
Parents: 022ed5c
Author: Sebastian Nagel <sn...@apache.org>
Authored: Wed Feb 1 11:51:04 2017 +0100
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Wed Feb 1 11:51:04 2017 +0100
----------------------------------------------------------------------
.../nutch/net/urlnormalizer/basic/BasicURLNormalizer.java | 5 ++++-
.../nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java | 6 ++++++
2 files changed, 10 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/700857d1/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index e17b19a..15a1de0 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -79,7 +79,7 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
if ("http".equals(protocol) || "https".equals(protocol)
|| "ftp".equals(protocol)) {
- if (host != null) {
+ if (host != null && url.getAuthority() != null) {
String newHost = host.toLowerCase(Locale.ROOT); // lowercase host
if (!host.equals(newHost)) {
host = newHost;
@@ -89,6 +89,9 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
// etc.) which will likely cause a change if left away
changed = true;
}
+ } else {
+ // no host or authority: recompose the URL from components
+ changed = true;
}
if (port == url.getDefaultPort()) { // uses default port
http://git-wip-us.apache.org/repos/asf/nutch/blob/700857d1/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 006c1a3..1d5d99e 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -100,6 +100,12 @@ public class TestBasicURLNormalizer {
"http://foo.com/aa/bb/foo.html");
normalizeTest("http://foo.com/aa?referer=http://bar.com",
"http://foo.com/aa?referer=http://bar.com");
+ // check for NPEs when normalizing URLs without host (authority)
+ normalizeTest("file:///foo/bar.txt", "file:///foo/bar.txt");
+ normalizeTest("ftp:/", "ftp:/");
+ normalizeTest("http:", "http:/");
+ normalizeTest("http:////", "http:/");
+ normalizeTest("http:///////", "http:/");
}
private void normalizeTest(String weird, String normal) throws Exception {