You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/06/21 00:56:33 UTC
svn commit: r1604298 - in /nutch: branches/2.x/
branches/2.x/src/java/org/apache/nutch/util/
branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/
branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/
branches/2.x/...
Author: snagel
Date: Fri Jun 20 22:56:32 2014
New Revision: 1604298
URL: http://svn.apache.org/r1604298
Log:
NUTCH-1767 remove special treatment of "params" in relative links
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Jun 20 22:56:32 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1767 remove special treatment of "params" in relative links (snagel)
+
* NUTCH-1718 redefine http.robots.agent as "additional agent names" (snagel, Tejas Patil, Daniel Kugel)
* NUTCH-1796 Ensure Gora object builders are used as oppose to empty constructors (snagel via lewismc)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Fri Jun 20 22:56:32 2014
@@ -28,9 +28,8 @@ import org.apache.nutch.util.domain.Doma
public class URLUtil {
/**
- * Resolve relative URL-s and fix a few java.net.URL errors
- * in handling of URLs with embedded params and pure query
- * targets.
+ * Resolve relative URL-s and fix a java.net.URL error
+ * in handling of URLs with pure query targets.
* @param base base url
* @param target target url (may be relative)
* @return resolved absolute url.
Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Jun 20 22:56:32 2014
@@ -298,51 +298,6 @@ public class DOMContentUtils {
}
/**
- * Handles cases where the url param information is encoded into the base
- * url as opposed to the target.
- * <p>
- * If the taget contains params (i.e. ';xxxx') information then the target
- * params information is assumed to be correct and any base params information
- * is ignored. If the base contains params information but the tareget does
- * not, then the params information is moved to the target allowing it to be
- * correctly determined by the java.net.URL class.
- *
- * @param base The base URL.
- * @param target The target path from the base URL.
- *
- * @return URL A URL with the params information correctly encoded.
- *
- * @throws MalformedURLException If the url is not a well formed URL.
- */
- private URL fixEmbeddedParams(URL base, String target)
- throws MalformedURLException{
-
- // the target contains params information or the base doesn't then no
- // conversion necessary, return regular URL
- if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
- return new URL(base, target);
- }
-
- // get the base url and it params information
- String baseURL = base.toString();
- int startParams = baseURL.indexOf(';');
- String params = baseURL.substring(startParams);
-
- // if the target has a query string then put the params information after
- // any path but before the query string, otherwise just append to the path
- int startQS = target.indexOf('?');
- if (startQS >= 0) {
- target = target.substring(0, startQS) + params +
- target.substring(startQS);
- }
- else {
- target += params;
- }
-
- return URLUtil.resolveURL(base, target);
- }
-
- /**
* This method finds all anchors below the supplied DOM
* <code>node</code>, and creates appropriate {@link Outlink}
* records for each (relative to the supplied <code>base</code>
@@ -397,9 +352,8 @@ public class DOMContentUtils {
}
if (target != null && !noFollow && !post)
try {
-
- URL url = (base.toString().indexOf(';') > 0) ?
- fixEmbeddedParams(base, target) : URLUtil.resolveURL(base, target);
+
+ URL url = URLUtil.resolveURL(base, target);
outlinks.add(new Outlink(url.toString(),
linkText.toString().trim()));
} catch (MalformedURLException e) {
Modified: nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Jun 20 22:56:32 2014
@@ -289,11 +289,12 @@ public class TestDOMContentUtils {
new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
},
{
- new Outlink("http://www.nutch.org/g;something", "anchor1"),
- new Outlink("http://www.nutch.org/g;something?y#s", "anchor2"),
+ // this is tricky - see RFC3986 section 5.4.1 example 7
+ new Outlink("http://www.nutch.org/g", "anchor1"),
+ new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
- new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5")
+ new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
}
};
Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Fri Jun 20 22:56:32 2014
@@ -26,6 +26,7 @@ import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
@@ -298,51 +299,6 @@ public class DOMContentUtils {
}
/**
- * Handles cases where the url param information is encoded into the base
- * url as opposed to the target.
- * <p>
- * If the taget contains params (i.e. ';xxxx') information then the target
- * params information is assumed to be correct and any base params information
- * is ignored. If the base contains params information but the tareget does
- * not, then the params information is moved to the target allowing it to be
- * correctly determined by the java.net.URL class.
- *
- * @param base The base URL.
- * @param target The target path from the base URL.
- *
- * @return URL A URL with the params information correctly encoded.
- *
- * @throws MalformedURLException If the url is not a well formed URL.
- */
- private URL fixEmbeddedParams(URL base, String target)
- throws MalformedURLException{
-
- // the target contains params information or the base doesn't then no
- // conversion necessary, return regular URL
- if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
- return new URL(base, target);
- }
-
- // get the base url and it params information
- String baseURL = base.toString();
- int startParams = baseURL.indexOf(';');
- String params = baseURL.substring(startParams);
-
- // if the target has a query string then put the params information after
- // any path but before the query string, otherwise just append to the path
- int startQS = target.indexOf('?');
- if (startQS >= 0) {
- target = target.substring(0, startQS) + params +
- target.substring(startQS);
- }
- else {
- target += params;
- }
-
- return new URL(base, target);
- }
-
- /**
* This method finds all anchors below the supplied DOM
* <code>node</code>, and creates appropriate {@link Outlink}
* records for each (relative to the supplied <code>base</code>
@@ -397,9 +353,8 @@ public class DOMContentUtils {
}
if (target != null && !noFollow && !post)
try {
-
- URL url = (base.toString().indexOf(';') > 0) ?
- fixEmbeddedParams(base, target) : new URL(base, target);
+
+ URL url = URLUtil.resolveURL(base, target);
outlinks.add(new Outlink(url.toString(),
linkText.toString().trim()));
} catch (MalformedURLException e) {
Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java Fri Jun 20 22:56:32 2014
@@ -285,11 +285,12 @@ public class DOMContentUtilsTest {
},
// 11
{
- new Outlink("http://www.nutch.org/g;something","anchor1"),
- new Outlink("http://www.nutch.org/g;something?y#s", "anchor2"),
+ // this is tricky - see RFC3986 section 5.4.1 example 7
+ new Outlink("http://www.nutch.org/g","anchor1"),
+ new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
- new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5") }
+ new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5") }
};
}
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jun 20 22:56:32 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1767 remove special treatment of "params" in relative links (snagel)
+
* NUTCH-1718 redefine http.robots.agent as "additional agent names" (snagel, Tejas Patil, Daniel Kugel)
* NUTCH-1794 IndexingFilterChecker to optionally dumpText (markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Jun 20 22:56:32 2014
@@ -1128,13 +1128,6 @@
</property>
<property>
- <name>parser.fix.embeddedparams</name>
- <value>true</value>
- <description>Whether to fix URL embedded params using semi-colons.
- See NUTCH-436 and NUTCH-1115</description>
-</property>
-
-<property>
<name>htmlparsefilter.order</name>
<value></value>
<description>The order by which HTMLParse filters are applied.
Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Fri Jun 20 22:56:32 2014
@@ -28,9 +28,8 @@ import org.apache.nutch.util.domain.Doma
public class URLUtil {
/**
- * Resolve relative URL-s and fix a few java.net.URL errors
- * in handling of URLs with embedded params and pure query
- * targets.
+ * Resolve relative URL-s and fix a java.net.URL error
+ * in handling of URLs with pure query targets.
* @param base base url
* @param target target url (may be relative)
* @return resolved absolute url.
@@ -40,13 +39,6 @@ public class URLUtil {
throws MalformedURLException {
target = target.trim();
- /* this is probably not needed anymore - see NUTCH-797.
- // handle params that are embedded into the base url - move them to target
- // so URL class constructs the new url class properly
- if (base.toString().indexOf(';') > 0)
- return fixEmbeddedParams(base, target);
- */
-
// handle the case that there is a target that is a pure query,
// for example
// http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&sk=0
@@ -82,53 +74,6 @@ public class URLUtil {
return new URL(base, target);
}
- /**
- * Handles cases where the url param information is encoded into the base url
- * as opposed to the target.
- * <p>
- * If the taget contains params (i.e. ';xxxx') information then the target
- * params information is assumed to be correct and any base params information
- * is ignored. If the base contains params information but the tareget does
- * not, then the params information is moved to the target allowing it to be
- * correctly determined by the java.net.URL class.
- *
- * @param base
- * The base URL.
- * @param target
- * The target path from the base URL.
- *
- * @return URL A URL with the params information correctly encoded.
- *
- * @throws MalformedURLException
- * If the url is not a well formed URL.
- */
- private static URL fixEmbeddedParams(URL base, String target)
- throws MalformedURLException {
-
- // the target contains params information or the base doesn't then no
- // conversion necessary, return regular URL
- if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
- return new URL(base, target);
- }
-
- // get the base url and it params information
- String baseURL = base.toString();
- int startParams = baseURL.indexOf(';');
- String params = baseURL.substring(startParams);
-
- // if the target has a query string then put the params information after
- // any path but before the query string, otherwise just append to the path
- int startQS = target.indexOf('?');
- if (startQS >= 0) {
- target = target.substring(0, startQS) + params
- + target.substring(startQS);
- } else {
- target += params;
- }
-
- return new URL(base, target);
- }
-
private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
/** Returns the domain name of the url. The domain name of a url is