You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2011/10/11 14:20:26 UTC
svn commit: r1181747 - in /nutch/trunk: ./ src/java/org/apache/nutch/util/
src/plugin/parse-html/
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/
src/plugin/parse-html/src/test/org/apache/nutch/parse/html/
src/plugin/parse-tika/src/java/org...
Author: ab
Date: Tue Oct 11 12:20:25 2011
New Revision: 1181747
URL: http://svn.apache.org/viewvc?rev=1181747&view=rev
Log:
NUTCH-797 Fix parse-tika and parse-html to use relative URL resolution per RFC-3986.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
nutch/trunk/src/plugin/parse-html/ivy.xml
nutch/trunk/src/plugin/parse-html/plugin.xml
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Oct 11 12:20:25 2011
@@ -2,6 +2,9 @@ Nutch Change Log
Release 1.4 - Current development
+* NUTCH-797 Fix parse-tika and parse-html to use relative URL resolution per RFC-3986
+ (Robert Hohman, ab)
+
* NUTCH-1154 Upgrade to Tika 0.10. NOTE: Tika's new RTF parser may ignore more
text in malformed documents than previously - see TIKA-748 for details. (ab)
Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Tue Oct 11 12:20:25 2011
@@ -26,6 +26,108 @@ import org.apache.nutch.util.domain.Doma
/** Utility class for URL analysis */
public class URLUtil {
+
+ /**
+ * Resolve a (possibly relative) target url against a base url, working
+ * around java.net.URL's handling of pure query targets ("?...").
+ * The workaround for params embedded in the base url is currently disabled.
+ * @param base base url
+ * @param target target url (may be relative); surrounding whitespace is trimmed
+ * @return resolved absolute url.
+ * @throws MalformedURLException if java.net.URL cannot build the resolved url
+ */
+ public static URL resolveURL(URL base, String target)
+ throws MalformedURLException {
+ target = target.trim();
+
+ /* this is probably not needed anymore - see NUTCH-797.
+ // handle params that are embedded into the base url - move them to target
+ // so URL class constructs the new url class properly
+ if (base.toString().indexOf(';') > 0)
+ return fixEmbeddedParams(base, target);
+ */
+
+ // handle the case that there is a target that is a pure query,
+ // for example
+ // http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&sk=0
+ // It has urls in the page of the form href="?co=0&sk=0&pg=1", and by
+ // default
+ // URL constructs the base+target combo as
+ // http://careers3.accenture.com/Careers/ASPX/?co=0&sk=0&pg=1, incorrectly
+ // dropping the Search.aspx path segment
+ //
+ // Browsers handle these just fine, they must have an exception similar to
+ // this
+ if (target.startsWith("?")) {
+ return fixPureQueryTargets(base, target);
+ }
+
+ return new URL(base, target);
+ }
+
+ /** Handle the case in RFC3986 section 5.4.1 example 7, and similar: a target starting with "?" is prefixed with the part of the base path after its last "/" before being resolved against the base. */
+ static URL fixPureQueryTargets(URL base, String target)
+ throws MalformedURLException {
+ if (!target.startsWith("?")) return new URL(base, target);
+
+ String basePath = base.getPath();
+ String baseRightMost = "";
+ int baseRightMostIdx = basePath.lastIndexOf("/");
+ if (baseRightMostIdx != -1) {
+ baseRightMost = basePath.substring(baseRightMostIdx + 1);
+ }
+
+ if (target.startsWith("?")) target = baseRightMost + target;
+
+ return new URL(base, target);
+ }
+
+ /**
+ * Handles cases where the url param information is encoded into the base url
+ * as opposed to the target.
+ * <p>
+ * If the target contains params (i.e. ';xxxx') information then the target
+ * params information is assumed to be correct and any base params information
+ * is ignored. If the base contains params information but the target does
+ * not, then the params information is moved to the target allowing it to be
+ * correctly determined by the java.net.URL class.
+ * NOTE(review): the only call visible here, in resolveURL, is commented out.
+ * @param base
+ * The base URL.
+ * @param target
+ * The target path from the base URL.
+ *
+ * @return URL A URL with the params information correctly encoded.
+ *
+ * @throws MalformedURLException
+ * If the url is not a well formed URL.
+ */
+ private static URL fixEmbeddedParams(URL base, String target)
+ throws MalformedURLException {
+
+ // if the target contains params information or the base doesn't, then no
+ // conversion is necessary, return regular URL
+ if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
+ return new URL(base, target);
+ }
+
+ // get the base url and its params information (everything from the first ';')
+ String baseURL = base.toString();
+ int startParams = baseURL.indexOf(';');
+ String params = baseURL.substring(startParams);
+
+ // if the target has a query string then put the params information after
+ // any path but before the query string, otherwise just append to the path
+ int startQS = target.indexOf('?');
+ if (startQS >= 0) {
+ target = target.substring(0, startQS) + params
+ + target.substring(startQS);
+ } else {
+ target += params;
+ }
+
+ return new URL(base, target);
+ }
private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
Modified: nutch/trunk/src/plugin/parse-html/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/ivy.xml?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/ivy.xml (original)
+++ nutch/trunk/src/plugin/parse-html/ivy.xml Tue Oct 11 12:20:25 2011
@@ -36,7 +36,7 @@
</publications>
<dependencies>
- <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2"/>
+ <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2.1"/>
</dependencies>
</ivy-module>
Modified: nutch/trunk/src/plugin/parse-html/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/plugin.xml?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/plugin.xml (original)
+++ nutch/trunk/src/plugin/parse-html/plugin.xml Tue Oct 11 12:20:25 2011
@@ -25,7 +25,7 @@
<library name="parse-html.jar">
<export name="*"/>
</library>
- <library name="tagsoup-1.2.jar"/>
+ <library name="tagsoup-1.2.1.jar"/>
</runtime>
<requires>
Modified: nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Tue Oct 11 12:20:25 2011
@@ -26,6 +26,7 @@ import java.util.Stack;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
import org.apache.hadoop.conf.Configuration;
import org.w3c.dom.*;
@@ -39,8 +40,6 @@ import org.w3c.dom.*;
*/
public class DOMContentUtils {
- private boolean fixEmbeddedParams;
-
public static class LinkParams {
public String elName;
public String attrName;
@@ -57,7 +56,7 @@ public class DOMContentUtils {
}
}
- private HashMap linkParams = new HashMap();
+ private HashMap<String,LinkParams> linkParams = new HashMap<String,LinkParams>();
private Configuration conf;
public DOMContentUtils(Configuration conf) {
@@ -89,9 +88,6 @@ public class DOMContentUtils {
if ( ! forceTags.contains(ignoreTags[i]) )
linkParams.remove(ignoreTags[i]);
}
-
- // https://issues.apache.org/jira/browse/NUTCH-1115
- fixEmbeddedParams = conf.getBoolean("parser.fix.embeddedparams", true);
}
/**
@@ -305,51 +301,6 @@ public class DOMContentUtils {
}
/**
- * Handles cases where the url param information is encoded into the base
- * url as opposed to the target.
- * <p>
- * If the taget contains params (i.e. ';xxxx') information then the target
- * params information is assumed to be correct and any base params information
- * is ignored. If the base contains params information but the tareget does
- * not, then the params information is moved to the target allowing it to be
- * correctly determined by the java.net.URL class.
- *
- * @param base The base URL.
- * @param target The target path from the base URL.
- *
- * @return URL A URL with the params information correctly encoded.
- *
- * @throws MalformedURLException If the url is not a well formed URL.
- */
- private URL fixEmbeddedParams(URL base, String target)
- throws MalformedURLException{
-
- // the target contains params information or the base doesn't then no
- // conversion necessary, return regular URL
- if (!fixEmbeddedParams || target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
- return new URL(base, target);
- }
-
- // get the base url and it params information
- String baseURL = base.toString();
- int startParams = baseURL.indexOf(';');
- String params = baseURL.substring(startParams);
-
- // if the target has a query string then put the params information after
- // any path but before the query string, otherwise just append to the path
- int startQS = target.indexOf('?');
- if (startQS >= 0) {
- target = target.substring(0, startQS) + params +
- target.substring(startQS);
- }
- else {
- target += params;
- }
-
- return new URL(base, target);
- }
-
- /**
* This method finds all anchors below the supplied DOM
* <code>node</code>, and creates appropriate {@link Outlink}
* records for each (relative to the supplied <code>base</code>
@@ -363,7 +314,7 @@ public class DOMContentUtils {
* nodes (this is a common DOM-fixup artifact, at least with
* nekohtml).
*/
- public void getOutlinks(URL base, ArrayList outlinks,
+ public void getOutlinks(URL base, ArrayList<Outlink> outlinks,
Node node) {
NodeWalker walker = new NodeWalker(node);
@@ -434,8 +385,7 @@ public class DOMContentUtils {
if (target != null && !noFollow && !post)
try {
- URL url = (base.toString().indexOf(';') > 0) ?
- fixEmbeddedParams(base, target) : new URL(base, target);
+ URL url = URLUtil.resolveURL(base, target);
outlinks.add(new Outlink(url.toString(),
linkText.toString().trim()));
} catch (MalformedURLException e) {
Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Tue Oct 11 12:20:25 2011
@@ -294,11 +294,12 @@ public class TestDOMContentUtils extends
new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
},
{
- new Outlink("http://www.nutch.org/g;something", "anchor1"),
- new Outlink("http://www.nutch.org/g;something?y#s", "anchor2"),
+ // this is tricky - see RFC3986 section 5.4.1 example 7
+ new Outlink("http://www.nutch.org/g", "anchor1"),
+ new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
- new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5")
+ new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
},
{
new Outlink("http://www.nutch.org/g", ""),
Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Tue Oct 11 12:20:25 2011
@@ -26,6 +26,7 @@ import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
@@ -39,8 +40,6 @@ import org.w3c.dom.NodeList;
*/
class DOMContentUtils {
- private boolean fixEmbeddedParams;
-
private static class LinkParams {
private String elName;
private String attrName;
@@ -57,7 +56,7 @@ class DOMContentUtils {
}
}
- private HashMap linkParams = new HashMap();
+ private HashMap<String,LinkParams> linkParams = new HashMap<String,LinkParams>();
private Configuration conf;
DOMContentUtils(Configuration conf) {
@@ -89,9 +88,6 @@ class DOMContentUtils {
if ( ! forceTags.contains(ignoreTags[i]) )
linkParams.remove(ignoreTags[i]);
}
-
- // https://issues.apache.org/jira/browse/NUTCH-1115
- fixEmbeddedParams = conf.getBoolean("parser.fix.embeddedparams", true);
}
/**
@@ -305,51 +301,6 @@ class DOMContentUtils {
}
/**
- * Handles cases where the url param information is encoded into the base
- * url as opposed to the target.
- * <p>
- * If the taget contains params (i.e. ';xxxx') information then the target
- * params information is assumed to be correct and any base params information
- * is ignored. If the base contains params information but the tareget does
- * not, then the params information is moved to the target allowing it to be
- * correctly determined by the java.net.URL class.
- *
- * @param base The base URL.
- * @param target The target path from the base URL.
- *
- * @return URL A URL with the params information correctly encoded.
- *
- * @throws MalformedURLException If the url is not a well formed URL.
- */
- private URL fixEmbeddedParams(URL base, String target)
- throws MalformedURLException{
-
- // the target contains params information or the base doesn't then no
- // conversion necessary, return regular URL
- if (!fixEmbeddedParams || target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
- return new URL(base, target);
- }
-
- // get the base url and it params information
- String baseURL = base.toString();
- int startParams = baseURL.indexOf(';');
- String params = baseURL.substring(startParams);
-
- // if the target has a query string then put the params information after
- // any path but before the query string, otherwise just append to the path
- int startQS = target.indexOf('?');
- if (startQS >= 0) {
- target = target.substring(0, startQS) + params +
- target.substring(startQS);
- }
- else {
- target += params;
- }
-
- return new URL(base, target);
- }
-
- /**
* This method finds all anchors below the supplied DOM
* <code>node</code>, and creates appropriate {@link Outlink}
* records for each (relative to the supplied <code>base</code>
@@ -363,7 +314,7 @@ class DOMContentUtils {
* nodes (this is a common DOM-fixup artifact, at least with
* nekohtml).
*/
- void getOutlinks(URL base, ArrayList outlinks,
+ void getOutlinks(URL base, ArrayList<Outlink> outlinks,
Node node) {
NodeWalker walker = new NodeWalker(node);
@@ -405,8 +356,7 @@ class DOMContentUtils {
if (target != null && !noFollow && !post)
try {
- URL url = (base.toString().indexOf(';') > 0) ?
- fixEmbeddedParams(base, target) : new URL(base, target);
+ URL url = URLUtil.resolveURL(base, target);
outlinks.add(new Outlink(url.toString(),
linkText.toString().trim()));
} catch (MalformedURLException e) {
Modified: nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java Tue Oct 11 12:20:25 2011
@@ -213,4 +213,50 @@ public class TestURLUtil
assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aSubDotCom, true));
}
+ // examples from RFC3986 section 5.4.1, as {relative target, expected absolute url} pairs
+ private static String baseString = "http://a/b/c/d;p?q";
+ private static String[][] targets = new String[][] {
+ // unknown protocol {"g:h" , "g:h"},
+ {"g" , "http://a/b/c/g"},
+ { "./g" , "http://a/b/c/g"},
+ { "g/" , "http://a/b/c/g/"},
+ { "/g" , "http://a/g"},
+ { "//g" , "http://g"},
+ { "?y" , "http://a/b/c/d;p?y"},
+ { "g?y" , "http://a/b/c/g?y"},
+ { "#s" , "http://a/b/c/d;p?q#s"},
+ { "g#s" , "http://a/b/c/g#s"},
+ { "g?y#s" , "http://a/b/c/g?y#s"},
+ { ";x" , "http://a/b/c/;x"},
+ { "g;x" , "http://a/b/c/g;x"},
+ { "g;x?y#s" , "http://a/b/c/g;x?y#s"},
+ { "" , "http://a/b/c/d;p?q"},
+ { "." , "http://a/b/c/"},
+ { "./" , "http://a/b/c/"},
+ { ".." , "http://a/b/"},
+ { "../" , "http://a/b/"},
+ { "../g" , "http://a/b/g"},
+ { "../.." , "http://a/"},
+ { "../../" , "http://a/"},
+ { "../../g" , "http://a/g"}
+ };
+
+ public void testResolveURL() throws Exception {
+ // test NUTCH-436: pure query target against a base with params, query and fragment
+ URL u436 = new URL("http://a/b/c/d;p?q#f");
+ assertEquals("http://a/b/c/d;p?q#f", u436.toString());
+ URL abs = URLUtil.resolveURL(u436, "?y");
+ assertEquals("http://a/b/c/d;p?y", abs.toString());
+ // test NUTCH-566: pure query target must keep the last path segment of the base
+ URL u566 = new URL("http://www.fleurie.org/entreprise.asp");
+ abs = URLUtil.resolveURL(u566, "?id_entrep=111");
+ assertEquals("http://www.fleurie.org/entreprise.asp?id_entrep=111", abs.toString());
+ URL base = new URL(baseString);
+ assertEquals("base url parsing", baseString, base.toString());
+ for (int i = 0; i < targets.length; i++) {
+ URL u = URLUtil.resolveURL(base, targets[i][0]);
+ assertEquals(targets[i][1], targets[i][1], u.toString());
+ }
+ }
+
}