You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/04/28 23:09:02 UTC

svn commit: r1590796 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/util/URLUtil.java src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java src/test/org/apache/nutch/util/TestURLUtil.java

Author: snagel
Date: Mon Apr 28 21:09:01 2014
New Revision: 1590796

URL: http://svn.apache.org/r1590796
Log:
NUTCH-797 URL not properly constructed when link target begins with a "?"

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
    nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
    nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1590796&r1=1590795&r2=1590796&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Apr 28 21:09:01 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-797 URL not properly constructed when link target begins with a "?" (Doug Cook, Robert Hohman, Stondet, ab via snagel)
+
 * NUTCH-1759 Upgrade to Crawler Commons 0.4 (jnioche)
 
 * NUTCH-1700 Remove deprecated code in src/plugin/creativecommons/build.xml (lewismc)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1590796&r1=1590795&r2=1590796&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Mon Apr 28 21:09:01 2014
@@ -27,6 +27,54 @@ import org.apache.nutch.util.domain.Doma
 /** Utility class for URL analysis */
 public class URLUtil {
 
+  /**
+   * Resolves a (possibly relative) target against a base URL, working
+   * around java.net.URL's handling of pure query targets, i.e. targets
+   * that start with "?". All other targets go to new URL(base, target).
+   * @param base base URL the target is resolved against
+   * @param target target URL (may be relative); trimmed before use, must not be null
+   * @return the resolved absolute URL
+   * @throws MalformedURLException if the URL constructor rejects the combination
+   */
+  public static URL resolveURL(URL base, String target)
+          throws MalformedURLException {
+    target = target.trim();
+
+    // Special-case a target that is a pure query string. Example: the page
+    // http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&sk=0
+    // contains links of the form href="?co=0&sk=0&pg=1". By default,
+    // java.net.URL resolves base + target to
+    // http://careers3.accenture.com/Careers/ASPX/?co=0&sk=0&pg=1,
+    // incorrectly dropping the last path segment (Search.aspx) of the base.
+    // The expected result keeps it:
+    // http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&sk=0&pg=1
+    //
+    // Browsers resolve such links correctly, so the same exception is made
+    // here.
+    if (target.startsWith("?")) {
+      return fixPureQueryTargets(base, target);
+    }
+
+    return new URL(base, target);
+  }
+
+  /** Prepends the base's last path segment to a pure query target (RFC3986 section 5.4.1 example 7, "?y"), then resolves it. */
+   static URL fixPureQueryTargets(URL base, String target)
+          throws MalformedURLException {
+    if (!target.startsWith("?")) return new URL(base, target); // not a pure query: default resolution
+
+    String basePath = base.getPath();
+    String baseRightMost = ""; // stays empty when the base path contains no "/"
+    int baseRightMostIdx = basePath.lastIndexOf("/");
+    if (baseRightMostIdx != -1) {
+      baseRightMost = basePath.substring(baseRightMostIdx + 1); // everything after the last "/"
+    }
+
+    if (target.startsWith("?")) target = baseRightMost + target; // always true after the guard above
+
+    return new URL(base, target);
+  }
+
   private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
 
   /** Returns the domain name of the url. The domain name of a url is

Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=1590796&r1=1590795&r2=1590796&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Mon Apr 28 21:09:01 2014
@@ -25,6 +25,7 @@ import java.util.HashMap;
 
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
 import org.apache.hadoop.conf.Configuration;
 
 import org.w3c.dom.*;
@@ -338,7 +339,7 @@ public class DOMContentUtils {
       target += params;
     }
     
-    return new URL(base, target);
+    return URLUtil.resolveURL(base, target);
   }
 
   /**
@@ -398,7 +399,7 @@ public class DOMContentUtils {
               try {
                 
                 URL url = (base.toString().indexOf(';') > 0) ? 
-                  fixEmbeddedParams(base, target) :  new URL(base, target);
+                  fixEmbeddedParams(base, target) :  URLUtil.resolveURL(base, target);
                 outlinks.add(new Outlink(url.toString(),
                                          linkText.toString().trim()));
               } catch (MalformedURLException e) {

Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1590796&r1=1590795&r2=1590796&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java Mon Apr 28 21:09:01 2014
@@ -211,6 +211,53 @@ public class TestURLUtil {
     assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aSubDotCom, true));
   }
   
+  // Reference resolution examples from RFC3986 section 5.4.1: each entry is { relative target, expected absolute URL }
+  private static String baseString = "http://a/b/c/d;p?q";
+  private static String[][] targets = new String[][] {
+    // omitted (unknown protocol): {"g:h"           ,  "g:h"},
+    {"g"             ,  "http://a/b/c/g"},
+    { "./g"           ,  "http://a/b/c/g"},
+    { "g/"            ,  "http://a/b/c/g/"},
+    { "/g"            ,  "http://a/g"},
+    { "//g"           ,  "http://g"},
+    { "?y"            ,  "http://a/b/c/d;p?y"},
+    { "g?y"           ,  "http://a/b/c/g?y"},
+    { "#s"            ,  "http://a/b/c/d;p?q#s"},
+    { "g#s"           ,  "http://a/b/c/g#s"},
+    { "g?y#s"         ,  "http://a/b/c/g?y#s"},
+    { ";x"            ,  "http://a/b/c/;x"},
+    { "g;x"           ,  "http://a/b/c/g;x"},
+    { "g;x?y#s"       ,  "http://a/b/c/g;x?y#s"},
+    { ""              ,  "http://a/b/c/d;p?q"},
+    { "."             ,  "http://a/b/c/"},
+    { "./"            ,  "http://a/b/c/"},
+    { ".."            ,  "http://a/b/"},
+    { "../"           ,  "http://a/b/"},
+    { "../g"          ,  "http://a/b/g"},
+    { "../.."         ,  "http://a/"},
+    { "../../"        ,  "http://a/"},
+    { "../../g"       ,  "http://a/g"}
+  };
+
+  @Test
+  public void testResolveURL() throws Exception {
+    // test NUTCH-436: a pure query target replaces the base's query, keeps its path params and drops its fragment
+    URL u436 = new URL("http://a/b/c/d;p?q#f");
+    assertEquals("http://a/b/c/d;p?q#f", u436.toString());
+    URL abs = URLUtil.resolveURL(u436, "?y");
+    assertEquals("http://a/b/c/d;p?y", abs.toString());
+    // test NUTCH-566: a pure query target keeps the last path segment of the base
+    URL u566 = new URL("http://www.fleurie.org/entreprise.asp");
+    abs = URLUtil.resolveURL(u566, "?id_entrep=111");
+    assertEquals("http://www.fleurie.org/entreprise.asp?id_entrep=111", abs.toString());
+    URL base = new URL(baseString);
+    assertEquals("base url parsing", baseString, base.toString());
+    for (int i = 0; i < targets.length; i++) {
+      URL u = URLUtil.resolveURL(base, targets[i][0]);
+      assertEquals(targets[i][1], targets[i][1], u.toString()); // first argument is the failure message
+    }
+  }
+  
   @Test
   public void testToUNICODE() throws Exception {
     assertEquals("http://www.çevir.com", URLUtil.toUNICODE("http://www.xn--evir-zoa.com"));