You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/06/21 00:56:33 UTC

svn commit: r1604298 - in /nutch: branches/2.x/ branches/2.x/src/java/org/apache/nutch/util/ branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/ branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/ branches/2.x/...

Author: snagel
Date: Fri Jun 20 22:56:32 2014
New Revision: 1604298

URL: http://svn.apache.org/r1604298
Log:
NUTCH-1767 remove special treatment of "params" in relative links

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
    nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
    nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
    nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
    nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Jun 20 22:56:32 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1767 remove special treatment of "params" in relative links (snagel)
+
 * NUTCH-1718 redefine http.robots.agent as "additional agent names" (snagel, Tejas Patil, Daniel Kugel)
 
 * NUTCH-1796 Ensure Gora object builders are used as oppose to empty constructors (snagel via lewismc)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Fri Jun 20 22:56:32 2014
@@ -28,9 +28,8 @@ import org.apache.nutch.util.domain.Doma
 public class URLUtil {
 
   /**
-   * Resolve relative URL-s and fix a few java.net.URL errors
-   * in handling of URLs with embedded params and pure query
-   * targets.
+   * Resolve relative URL-s and fix a java.net.URL error
+   * in handling of URLs with pure query targets.
    * @param base base url
    * @param target target url (may be relative)
    * @return resolved absolute url.

Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Jun 20 22:56:32 2014
@@ -298,51 +298,6 @@ public class DOMContentUtils {
   }
   
   /**
-   * Handles cases where the url param information is encoded into the base
-   * url as opposed to the target.
-   * <p>
-   * If the taget contains params (i.e. ';xxxx') information then the target 
-   * params information is assumed to be correct and any base params information
-   * is ignored.  If the base contains params information but the tareget does
-   * not, then the params information is moved to the target allowing it to be
-   * correctly determined by the java.net.URL class.
-   * 
-   * @param base The base URL.
-   * @param target The target path from the base URL.
-   * 
-   * @return URL A URL with the params information correctly encoded.
-   * 
-   * @throws MalformedURLException If the url is not a well formed URL.
-   */
-  private URL fixEmbeddedParams(URL base, String target) 
-    throws MalformedURLException{
-    
-    // the target contains params information or the base doesn't then no
-    // conversion necessary, return regular URL
-    if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
-      return new URL(base, target);
-    }
-    
-    // get the base url and it params information
-    String baseURL = base.toString();
-    int startParams = baseURL.indexOf(';');
-    String params = baseURL.substring(startParams);
-    
-    // if the target has a query string then put the params information after
-    // any path but before the query string, otherwise just append to the path
-    int startQS = target.indexOf('?');
-    if (startQS >= 0) {
-      target = target.substring(0, startQS) + params + 
-        target.substring(startQS);
-    }
-    else {
-      target += params;
-    }
-    
-    return URLUtil.resolveURL(base, target);
-  }
-
-  /**
    * This method finds all anchors below the supplied DOM
    * <code>node</code>, and creates appropriate {@link Outlink}
    * records for each (relative to the supplied <code>base</code>
@@ -397,9 +352,8 @@ public class DOMContentUtils {
             }
             if (target != null && !noFollow && !post)
               try {
-                
-                URL url = (base.toString().indexOf(';') > 0) ? 
-                  fixEmbeddedParams(base, target) :  URLUtil.resolveURL(base, target);
+
+                URL url = URLUtil.resolveURL(base, target);
                 outlinks.add(new Outlink(url.toString(),
                                          linkText.toString().trim()));
               } catch (MalformedURLException e) {

Modified: nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Jun 20 22:56:32 2014
@@ -289,11 +289,12 @@ public class TestDOMContentUtils {
           new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
         },
         {
-          new Outlink("http://www.nutch.org/g;something", "anchor1"),
-          new Outlink("http://www.nutch.org/g;something?y#s", "anchor2"),
+          // this is tricky - see RFC3986 section 5.4.1 example 7
+          new Outlink("http://www.nutch.org/g", "anchor1"),
+          new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
           new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
           new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
-          new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5")
+          new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
         }
     };
 

Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Fri Jun 20 22:56:32 2014
@@ -26,6 +26,7 @@ import java.util.HashMap;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
@@ -298,51 +299,6 @@ public class DOMContentUtils {
   }
   
   /**
-   * Handles cases where the url param information is encoded into the base
-   * url as opposed to the target.
-   * <p>
-   * If the taget contains params (i.e. ';xxxx') information then the target 
-   * params information is assumed to be correct and any base params information
-   * is ignored.  If the base contains params information but the tareget does
-   * not, then the params information is moved to the target allowing it to be
-   * correctly determined by the java.net.URL class.
-   * 
-   * @param base The base URL.
-   * @param target The target path from the base URL.
-   * 
-   * @return URL A URL with the params information correctly encoded.
-   * 
-   * @throws MalformedURLException If the url is not a well formed URL.
-   */
-  private URL fixEmbeddedParams(URL base, String target) 
-    throws MalformedURLException{
-    
-    // the target contains params information or the base doesn't then no
-    // conversion necessary, return regular URL
-    if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
-      return new URL(base, target);
-    }
-    
-    // get the base url and it params information
-    String baseURL = base.toString();
-    int startParams = baseURL.indexOf(';');
-    String params = baseURL.substring(startParams);
-    
-    // if the target has a query string then put the params information after
-    // any path but before the query string, otherwise just append to the path
-    int startQS = target.indexOf('?');
-    if (startQS >= 0) {
-      target = target.substring(0, startQS) + params + 
-        target.substring(startQS);
-    }
-    else {
-      target += params;
-    }
-    
-    return new URL(base, target);
-  }
-
-  /**
    * This method finds all anchors below the supplied DOM
    * <code>node</code>, and creates appropriate {@link Outlink}
    * records for each (relative to the supplied <code>base</code>
@@ -397,9 +353,8 @@ public class DOMContentUtils {
             }
             if (target != null && !noFollow && !post)
               try {
-                
-                URL url = (base.toString().indexOf(';') > 0) ? 
-                  fixEmbeddedParams(base, target) :  new URL(base, target);
+
+                URL url = URLUtil.resolveURL(base, target);
                 outlinks.add(new Outlink(url.toString(),
                                          linkText.toString().trim()));
               } catch (MalformedURLException e) {

Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java Fri Jun 20 22:56:32 2014
@@ -285,11 +285,12 @@ public class DOMContentUtilsTest {
 				},
 				// 11
 				{
-				 new Outlink("http://www.nutch.org/g;something","anchor1"),
-				 new Outlink("http://www.nutch.org/g;something?y#s", "anchor2"),
+				 // this is tricky - see RFC3986 section 5.4.1 example 7
+				 new Outlink("http://www.nutch.org/g","anchor1"),
+				 new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
 				 new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
 				 new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
-				 new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5") }
+				 new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5") }
 				};
 
 	}

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jun 20 22:56:32 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1767 remove special treatment of "params" in relative links (snagel)
+
 * NUTCH-1718 redefine http.robots.agent as "additional agent names" (snagel, Tejas Patil, Daniel Kugel)
 
 * NUTCH-1794 IndexingFilterChecker to optionally dumpText (markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Jun 20 22:56:32 2014
@@ -1128,13 +1128,6 @@
 </property>
 
 <property>
-  <name>parser.fix.embeddedparams</name>
-  <value>true</value>
-  <description>Whether to fix URL embedded params using semi-colons.
-  See NUTCH-436 and NUTCH-1115</description>
-</property>
-
-<property>
   <name>htmlparsefilter.order</name>
   <value></value>
   <description>The order by which HTMLParse filters are applied.

Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1604298&r1=1604297&r2=1604298&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Fri Jun 20 22:56:32 2014
@@ -28,9 +28,8 @@ import org.apache.nutch.util.domain.Doma
 public class URLUtil {
   
   /**
-   * Resolve relative URL-s and fix a few java.net.URL errors
-   * in handling of URLs with embedded params and pure query
-   * targets.
+   * Resolve relative URL-s and fix a java.net.URL error
+   * in handling of URLs with pure query targets.
    * @param base base url
    * @param target target url (may be relative)
    * @return resolved absolute url.
@@ -40,13 +39,6 @@ public class URLUtil {
           throws MalformedURLException {
     target = target.trim();
 
-    /* this is probably not needed anymore - see NUTCH-797.
-    // handle params that are embedded into the base url - move them to target
-    // so URL class constructs the new url class properly
-    if (base.toString().indexOf(';') > 0)
-      return fixEmbeddedParams(base, target);
-    */
-    
     // handle the case that there is a target that is a pure query,
     // for example
     // http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&sk=0
@@ -82,53 +74,6 @@ public class URLUtil {
     return new URL(base, target);
   }
 
-  /**
-   * Handles cases where the url param information is encoded into the base url
-   * as opposed to the target.
-   * <p>
-   * If the taget contains params (i.e. ';xxxx') information then the target
-   * params information is assumed to be correct and any base params information
-   * is ignored. If the base contains params information but the tareget does
-   * not, then the params information is moved to the target allowing it to be
-   * correctly determined by the java.net.URL class.
-   * 
-   * @param base
-   *          The base URL.
-   * @param target
-   *          The target path from the base URL.
-   * 
-   * @return URL A URL with the params information correctly encoded.
-   * 
-   * @throws MalformedURLException
-   *           If the url is not a well formed URL.
-   */
-  private static URL fixEmbeddedParams(URL base, String target)
-          throws MalformedURLException {
-
-    // the target contains params information or the base doesn't then no
-    // conversion necessary, return regular URL
-    if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
-      return new URL(base, target);
-    }
-
-    // get the base url and it params information
-    String baseURL = base.toString();
-    int startParams = baseURL.indexOf(';');
-    String params = baseURL.substring(startParams);
-
-    // if the target has a query string then put the params information after
-    // any path but before the query string, otherwise just append to the path
-    int startQS = target.indexOf('?');
-    if (startQS >= 0) {
-      target = target.substring(0, startQS) + params
-              + target.substring(startQS);
-    } else {
-      target += params;
-    }
-
-    return new URL(base, target);
-  }
-
   private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
 
   /** Returns the domain name of the url. The domain name of a url is