You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2011/10/11 14:20:26 UTC

svn commit: r1181747 - in /nutch/trunk: ./ src/java/org/apache/nutch/util/ src/plugin/parse-html/ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/ src/plugin/parse-tika/src/java/org...

Author: ab
Date: Tue Oct 11 12:20:25 2011
New Revision: 1181747

URL: http://svn.apache.org/viewvc?rev=1181747&view=rev
Log:
NUTCH-797 Fix parse-tika and parse-html to use relative URL resolution per RFC-3986.

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
    nutch/trunk/src/plugin/parse-html/ivy.xml
    nutch/trunk/src/plugin/parse-html/plugin.xml
    nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
    nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
    nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Oct 11 12:20:25 2011
@@ -2,6 +2,9 @@ Nutch Change Log
 
 Release 1.4 - Current development
 
+* NUTCH-797 Fix parse-tika and parse-html to use relative URL resolution per RFC-3986
+  (Robert Hohman, ab)
+
 * NUTCH-1154 Upgrade to Tika 0.10. NOTE: Tika's new RTF parser may ignore more
   text in malformed documents than previously - see TIKA-748 for details. (ab)
 

Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Tue Oct 11 12:20:25 2011
@@ -26,6 +26,108 @@ import org.apache.nutch.util.domain.Doma
 
 /** Utility class for URL analysis */
 public class URLUtil {
+  
+  /**
+   * Resolve relative URL-s and fix a few java.net.URL errors
+   * in handling of URLs with embedded params and pure query
+   * targets.
+   * @param base base url
+   * @param target target url (may be relative)
+   * @return resolved absolute url.
+   * @throws MalformedURLException
+   */
+  public static URL resolveURL(URL base, String target)
+          throws MalformedURLException {
+    target = target.trim();
+
+    /* this is probably not needed anymore - see NUTCH-797.
+    // handle params that are embedded into the base url - move them to target
+    // so URL class constructs the new url class properly
+    if (base.toString().indexOf(';') > 0)
+      return fixEmbeddedParams(base, target);
+    */
+    
+    // handle the case that there is a target that is a pure query,
+    // for example
+    // http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&sk=0
+    // It has urls in the page of the form href="?co=0&sk=0&pg=1", and by
+    // default
+    // URL constructs the base+target combo as
+    // http://careers3.accenture.com/Careers/ASPX/?co=0&sk=0&pg=1, incorrectly
+    // dropping the Search.aspx target
+    //
+    // Browsers handle these just fine, they must have an exception similar to
+    // this
+    if (target.startsWith("?")) {
+      return fixPureQueryTargets(base, target);
+    }
+
+    return new URL(base, target);
+  }
+
+  /** Handle the case in RFC3986 section 5.4.1 example 7, and similar. */
+   static URL fixPureQueryTargets(URL base, String target)
+          throws MalformedURLException {
+    if (!target.startsWith("?")) return new URL(base, target);
+
+    String basePath = base.getPath();
+    String baseRightMost = "";
+    int baseRightMostIdx = basePath.lastIndexOf("/");
+    if (baseRightMostIdx != -1) {
+      baseRightMost = basePath.substring(baseRightMostIdx + 1);
+    }
+
+    if (target.startsWith("?")) target = baseRightMost + target;
+
+    return new URL(base, target);
+  }
+
+  /**
+   * Handles cases where the url param information is encoded into the base url
+   * as opposed to the target.
+   * <p>
+   * If the target contains params (i.e. ';xxxx') information then the target
+   * params information is assumed to be correct and any base params information
+   * is ignored. If the base contains params information but the target does
+   * not, then the params information is moved to the target allowing it to be
+   * correctly determined by the java.net.URL class.
+   * 
+   * @param base
+   *          The base URL.
+   * @param target
+   *          The target path from the base URL.
+   * 
+   * @return URL A URL with the params information correctly encoded.
+   * 
+   * @throws MalformedURLException
+   *           If the url is not a well formed URL.
+   */
+  private static URL fixEmbeddedParams(URL base, String target)
+          throws MalformedURLException {
+
+    // if the target contains params information or the base doesn't, then no
+    // conversion is necessary; return a regular URL
+    if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
+      return new URL(base, target);
+    }
+
+    // get the base url and its params information
+    String baseURL = base.toString();
+    int startParams = baseURL.indexOf(';');
+    String params = baseURL.substring(startParams);
+
+    // if the target has a query string then put the params information after
+    // any path but before the query string, otherwise just append to the path
+    int startQS = target.indexOf('?');
+    if (startQS >= 0) {
+      target = target.substring(0, startQS) + params
+              + target.substring(startQS);
+    } else {
+      target += params;
+    }
+
+    return new URL(base, target);
+  }
 
   private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
 

Modified: nutch/trunk/src/plugin/parse-html/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/ivy.xml?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/ivy.xml (original)
+++ nutch/trunk/src/plugin/parse-html/ivy.xml Tue Oct 11 12:20:25 2011
@@ -36,7 +36,7 @@
   </publications>
 
   <dependencies>
-   <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2"/>
+   <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2.1"/>
   </dependencies>
 
 </ivy-module>

Modified: nutch/trunk/src/plugin/parse-html/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/plugin.xml?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/plugin.xml (original)
+++ nutch/trunk/src/plugin/parse-html/plugin.xml Tue Oct 11 12:20:25 2011
@@ -25,7 +25,7 @@
       <library name="parse-html.jar">
          <export name="*"/>
       </library>
-      <library name="tagsoup-1.2.jar"/>
+      <library name="tagsoup-1.2.1.jar"/>
    </runtime>
 
    <requires>

Modified: nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Tue Oct 11 12:20:25 2011
@@ -26,6 +26,7 @@ import java.util.Stack;
 
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
 import org.apache.hadoop.conf.Configuration;
 
 import org.w3c.dom.*;
@@ -39,8 +40,6 @@ import org.w3c.dom.*;
  */
 public class DOMContentUtils {
 
-  private boolean fixEmbeddedParams;
-
   public static class LinkParams {
     public String elName;
     public String attrName;
@@ -57,7 +56,7 @@ public class DOMContentUtils {
       }
   }
   
-  private HashMap linkParams = new HashMap();
+  private HashMap<String,LinkParams> linkParams = new HashMap<String,LinkParams>();
   private Configuration conf;
   
   public DOMContentUtils(Configuration conf) {
@@ -89,9 +88,6 @@ public class DOMContentUtils {
       if ( ! forceTags.contains(ignoreTags[i]) )
         linkParams.remove(ignoreTags[i]);
     }
-
-    // https://issues.apache.org/jira/browse/NUTCH-1115
-    fixEmbeddedParams = conf.getBoolean("parser.fix.embeddedparams", true);
   }
   
   /**
@@ -305,51 +301,6 @@ public class DOMContentUtils {
   }
   
   /**
-   * Handles cases where the url param information is encoded into the base
-   * url as opposed to the target.
-   * <p>
-   * If the taget contains params (i.e. ';xxxx') information then the target 
-   * params information is assumed to be correct and any base params information
-   * is ignored.  If the base contains params information but the tareget does
-   * not, then the params information is moved to the target allowing it to be
-   * correctly determined by the java.net.URL class.
-   * 
-   * @param base The base URL.
-   * @param target The target path from the base URL.
-   * 
-   * @return URL A URL with the params information correctly encoded.
-   * 
-   * @throws MalformedURLException If the url is not a well formed URL.
-   */
-  private URL fixEmbeddedParams(URL base, String target) 
-    throws MalformedURLException{
-    
-    // the target contains params information or the base doesn't then no
-    // conversion necessary, return regular URL
-    if (!fixEmbeddedParams || target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
-      return new URL(base, target);
-    }
-    
-    // get the base url and it params information
-    String baseURL = base.toString();
-    int startParams = baseURL.indexOf(';');
-    String params = baseURL.substring(startParams);
-    
-    // if the target has a query string then put the params information after
-    // any path but before the query string, otherwise just append to the path
-    int startQS = target.indexOf('?');
-    if (startQS >= 0) {
-      target = target.substring(0, startQS) + params + 
-        target.substring(startQS);
-    }
-    else {
-      target += params;
-    }
-    
-    return new URL(base, target);
-  }
-
-  /**
    * This method finds all anchors below the supplied DOM
    * <code>node</code>, and creates appropriate {@link Outlink}
    * records for each (relative to the supplied <code>base</code>
@@ -363,7 +314,7 @@ public class DOMContentUtils {
    * nodes (this is a common DOM-fixup artifact, at least with
    * nekohtml).
    */
-  public void getOutlinks(URL base, ArrayList outlinks, 
+  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, 
                                        Node node) {
     
     NodeWalker walker = new NodeWalker(node);
@@ -434,8 +385,7 @@ public class DOMContentUtils {
             if (target != null && !noFollow && !post)
               try {
                 
-                URL url = (base.toString().indexOf(';') > 0) ? 
-                  fixEmbeddedParams(base, target) :  new URL(base, target);
+                URL url = URLUtil.resolveURL(base, target);
                 outlinks.add(new Outlink(url.toString(),
                                          linkText.toString().trim()));
               } catch (MalformedURLException e) {

Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Tue Oct 11 12:20:25 2011
@@ -294,11 +294,12 @@ public class TestDOMContentUtils extends
            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
          },
          {
-           new Outlink("http://www.nutch.org/g;something", "anchor1"),
-           new Outlink("http://www.nutch.org/g;something?y#s", "anchor2"),
+           // this is tricky - see RFC3986 section 5.4.1 example 7
+           new Outlink("http://www.nutch.org/g", "anchor1"),
+           new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
-           new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5")
+           new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
          },
          {
            new Outlink("http://www.nutch.org/g", ""),

Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Tue Oct 11 12:20:25 2011
@@ -26,6 +26,7 @@ import java.util.HashMap;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
@@ -39,8 +40,6 @@ import org.w3c.dom.NodeList;
  */
 class DOMContentUtils {
 
-  private boolean fixEmbeddedParams;
-
   private static class LinkParams {
 	private String elName;
 	private String attrName;
@@ -57,7 +56,7 @@ class DOMContentUtils {
       }
   }
   
-  private HashMap linkParams = new HashMap();
+  private HashMap<String,LinkParams> linkParams = new HashMap<String,LinkParams>();
   private Configuration conf;
   
   DOMContentUtils(Configuration conf) {
@@ -89,9 +88,6 @@ class DOMContentUtils {
       if ( ! forceTags.contains(ignoreTags[i]) )
         linkParams.remove(ignoreTags[i]);
     }
-
-    // https://issues.apache.org/jira/browse/NUTCH-1115
-    fixEmbeddedParams = conf.getBoolean("parser.fix.embeddedparams", true);
   }
   
   /**
@@ -305,51 +301,6 @@ class DOMContentUtils {
   }
   
   /**
-   * Handles cases where the url param information is encoded into the base
-   * url as opposed to the target.
-   * <p>
-   * If the taget contains params (i.e. ';xxxx') information then the target 
-   * params information is assumed to be correct and any base params information
-   * is ignored.  If the base contains params information but the tareget does
-   * not, then the params information is moved to the target allowing it to be
-   * correctly determined by the java.net.URL class.
-   * 
-   * @param base The base URL.
-   * @param target The target path from the base URL.
-   * 
-   * @return URL A URL with the params information correctly encoded.
-   * 
-   * @throws MalformedURLException If the url is not a well formed URL.
-   */
-  private URL fixEmbeddedParams(URL base, String target) 
-    throws MalformedURLException{
-
-    // the target contains params information or the base doesn't then no
-    // conversion necessary, return regular URL
-    if (!fixEmbeddedParams || target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
-      return new URL(base, target);
-    }
-    
-    // get the base url and it params information
-    String baseURL = base.toString();
-    int startParams = baseURL.indexOf(';');
-    String params = baseURL.substring(startParams);
-    
-    // if the target has a query string then put the params information after
-    // any path but before the query string, otherwise just append to the path
-    int startQS = target.indexOf('?');
-    if (startQS >= 0) {
-      target = target.substring(0, startQS) + params + 
-        target.substring(startQS);
-    }
-    else {
-      target += params;
-    }
-
-    return new URL(base, target);
-  }
-
-  /**
    * This method finds all anchors below the supplied DOM
    * <code>node</code>, and creates appropriate {@link Outlink}
    * records for each (relative to the supplied <code>base</code>
@@ -363,7 +314,7 @@ class DOMContentUtils {
    * nodes (this is a common DOM-fixup artifact, at least with
    * nekohtml).
    */
-  void getOutlinks(URL base, ArrayList outlinks, 
+  void getOutlinks(URL base, ArrayList<Outlink> outlinks, 
                                        Node node) {
     
     NodeWalker walker = new NodeWalker(node);
@@ -405,8 +356,7 @@ class DOMContentUtils {
             if (target != null && !noFollow && !post)
               try {
                 
-                URL url = (base.toString().indexOf(';') > 0) ? 
-                  fixEmbeddedParams(base, target) :  new URL(base, target);
+                URL url = URLUtil.resolveURL(base, target);
                 outlinks.add(new Outlink(url.toString(),
                                          linkText.toString().trim()));
               } catch (MalformedURLException e) {

Modified: nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1181747&r1=1181746&r2=1181747&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java Tue Oct 11 12:20:25 2011
@@ -213,4 +213,50 @@ public class TestURLUtil
     assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aSubDotCom, true));
   }
 
+  // from RFC3986 section 5.4.1
+  private static String baseString = "http://a/b/c/d;p?q";
+  private static String[][] targets = new String[][] {
+    // unknown protocol {"g:h"           ,  "g:h"},
+    {"g"             ,  "http://a/b/c/g"},
+    { "./g"           ,  "http://a/b/c/g"},
+    { "g/"            ,  "http://a/b/c/g/"},
+    { "/g"            ,  "http://a/g"},
+    { "//g"           ,  "http://g"},
+    { "?y"            ,  "http://a/b/c/d;p?y"},
+    { "g?y"           ,  "http://a/b/c/g?y"},
+    { "#s"            ,  "http://a/b/c/d;p?q#s"},
+    { "g#s"           ,  "http://a/b/c/g#s"},
+    { "g?y#s"         ,  "http://a/b/c/g?y#s"},
+    { ";x"            ,  "http://a/b/c/;x"},
+    { "g;x"           ,  "http://a/b/c/g;x"},
+    { "g;x?y#s"       ,  "http://a/b/c/g;x?y#s"},
+    { ""              ,  "http://a/b/c/d;p?q"},
+    { "."             ,  "http://a/b/c/"},
+    { "./"            ,  "http://a/b/c/"},
+    { ".."            ,  "http://a/b/"},
+    { "../"           ,  "http://a/b/"},
+    { "../g"          ,  "http://a/b/g"},
+    { "../.."         ,  "http://a/"},
+    { "../../"        ,  "http://a/"},
+    { "../../g"       ,  "http://a/g"}
+  };
+
+  public void testResolveURL() throws Exception {
+    // test NUTCH-436
+    URL u436 = new URL("http://a/b/c/d;p?q#f");
+    assertEquals("http://a/b/c/d;p?q#f", u436.toString());
+    URL abs = URLUtil.resolveURL(u436, "?y");
+    assertEquals("http://a/b/c/d;p?y", abs.toString());
+    // test NUTCH-566
+    URL u566 = new URL("http://www.fleurie.org/entreprise.asp");
+    abs = URLUtil.resolveURL(u566, "?id_entrep=111");
+    assertEquals("http://www.fleurie.org/entreprise.asp?id_entrep=111", abs.toString());
+    URL base = new URL(baseString);
+    assertEquals("base url parsing", baseString, base.toString());
+    for (int i = 0; i < targets.length; i++) {
+      URL u = URLUtil.resolveURL(base, targets[i][0]);
+      assertEquals(targets[i][1], targets[i][1], u.toString());
+    }
+  }
+
 }