You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/04/28 23:09:02 UTC
svn commit: r1590796 - in /nutch/branches/2.x: CHANGES.txt
src/java/org/apache/nutch/util/URLUtil.java
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
src/test/org/apache/nutch/util/TestURLUtil.java
Author: snagel
Date: Mon Apr 28 21:09:01 2014
New Revision: 1590796
URL: http://svn.apache.org/r1590796
Log:
NUTCH-797 URL not properly constructed when link target begins with a "?"
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1590796&r1=1590795&r2=1590796&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Apr 28 21:09:01 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-797 URL not properly constructed when link target begins with a "?" (Doug Cook, Robert Hohman, Stondet, ab via snagel)
+
* NUTCH-1759 Upgrade to Crawler Commons 0.4 (jnioche)
* NUTCH-1700 Remove deprecated code in src/plugin/creativecommons/build.xml (lewismc)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1590796&r1=1590795&r2=1590796&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Mon Apr 28 21:09:01 2014
@@ -27,6 +27,54 @@ import org.apache.nutch.util.domain.Doma
/** Utility class for URL analysis */
public class URLUtil {
+ /**
+ * Resolve relative URL-s and fix a few java.net.URL errors
+ * in handling of URLs with embedded params and pure query
+ * targets.
+ * @param base base url
+ * @param target target url (may be relative)
+ * @return resolved absolute url.
+ * @throws MalformedURLException
+ */
+ public static URL resolveURL(URL base, String target)
+ throws MalformedURLException {
+ target = target.trim();
+
+ // handle the case that there is a target that is a pure query,
+ // for example
+ // http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&sk=0
+ // It has urls in the page of the form href="?co=0&sk=0&pg=1", and by
+ // default
+ // URL constructs the base+target combo as
+ // http://careers3.accenture.com/Careers/ASPX/?co=0&sk=0&pg=1, incorrectly
+ // dropping the Search.aspx target
+ //
+ // Browsers handle these just fine, they must have an exception similar to
+ // this
+ if (target.startsWith("?")) {
+ return fixPureQueryTargets(base, target);
+ }
+
+ return new URL(base, target);
+ }
+
+ /** Handle the case in RFC3986 section 5.4.1 example 7, and similar. */
+ static URL fixPureQueryTargets(URL base, String target)
+ throws MalformedURLException {
+ if (!target.startsWith("?")) return new URL(base, target);
+
+ String basePath = base.getPath();
+ String baseRightMost = "";
+ int baseRightMostIdx = basePath.lastIndexOf("/");
+ if (baseRightMostIdx != -1) {
+ baseRightMost = basePath.substring(baseRightMostIdx + 1);
+ }
+
+ if (target.startsWith("?")) target = baseRightMost + target;
+
+ return new URL(base, target);
+ }
+
private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
/** Returns the domain name of the url. The domain name of a url is
Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=1590796&r1=1590795&r2=1590796&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Mon Apr 28 21:09:01 2014
@@ -25,6 +25,7 @@ import java.util.HashMap;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
import org.apache.hadoop.conf.Configuration;
import org.w3c.dom.*;
@@ -338,7 +339,7 @@ public class DOMContentUtils {
target += params;
}
- return new URL(base, target);
+ return URLUtil.resolveURL(base, target);
}
/**
@@ -398,7 +399,7 @@ public class DOMContentUtils {
try {
URL url = (base.toString().indexOf(';') > 0) ?
- fixEmbeddedParams(base, target) : new URL(base, target);
+ fixEmbeddedParams(base, target) : URLUtil.resolveURL(base, target);
outlinks.add(new Outlink(url.toString(),
linkText.toString().trim()));
} catch (MalformedURLException e) {
Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1590796&r1=1590795&r2=1590796&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java Mon Apr 28 21:09:01 2014
@@ -211,6 +211,53 @@ public class TestURLUtil {
assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aSubDotCom, true));
}
+ // from RFC3986 section 5.4.1
+ private static String baseString = "http://a/b/c/d;p?q";
+ private static String[][] targets = new String[][] {
+ // unknown protocol {"g:h" , "g:h"},
+ {"g" , "http://a/b/c/g"},
+ { "./g" , "http://a/b/c/g"},
+ { "g/" , "http://a/b/c/g/"},
+ { "/g" , "http://a/g"},
+ { "//g" , "http://g"},
+ { "?y" , "http://a/b/c/d;p?y"},
+ { "g?y" , "http://a/b/c/g?y"},
+ { "#s" , "http://a/b/c/d;p?q#s"},
+ { "g#s" , "http://a/b/c/g#s"},
+ { "g?y#s" , "http://a/b/c/g?y#s"},
+ { ";x" , "http://a/b/c/;x"},
+ { "g;x" , "http://a/b/c/g;x"},
+ { "g;x?y#s" , "http://a/b/c/g;x?y#s"},
+ { "" , "http://a/b/c/d;p?q"},
+ { "." , "http://a/b/c/"},
+ { "./" , "http://a/b/c/"},
+ { ".." , "http://a/b/"},
+ { "../" , "http://a/b/"},
+ { "../g" , "http://a/b/g"},
+ { "../.." , "http://a/"},
+ { "../../" , "http://a/"},
+ { "../../g" , "http://a/g"}
+ };
+
+ @Test
+ public void testResolveURL() throws Exception {
+ // test NUTCH-436
+ URL u436 = new URL("http://a/b/c/d;p?q#f");
+ assertEquals("http://a/b/c/d;p?q#f", u436.toString());
+ URL abs = URLUtil.resolveURL(u436, "?y");
+ assertEquals("http://a/b/c/d;p?y", abs.toString());
+ // test NUTCH-566
+ URL u566 = new URL("http://www.fleurie.org/entreprise.asp");
+ abs = URLUtil.resolveURL(u566, "?id_entrep=111");
+ assertEquals("http://www.fleurie.org/entreprise.asp?id_entrep=111", abs.toString());
+ URL base = new URL(baseString);
+ assertEquals("base url parsing", baseString, base.toString());
+ for (int i = 0; i < targets.length; i++) {
+ URL u = URLUtil.resolveURL(base, targets[i][0]);
+ assertEquals(targets[i][1], targets[i][1], u.toString());
+ }
+ }
+
@Test
public void testToUNICODE() throws Exception {
assertEquals("http://www.çevir.com", URLUtil.toUNICODE("http://www.xn--evir-zoa.com"));