You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ku...@apache.org on 2007/03/10 18:40:23 UTC

svn commit: r516757 - in /lucene/nutch/trunk/src/plugin/parse-html/src: java/org/apache/nutch/parse/html/DOMContentUtils.java test/org/apache/nutch/parse/html/TestDOMContentUtils.java

Author: kubes
Date: Sat Mar 10 09:40:20 2007
New Revision: 516757

URL: http://svn.apache.org/viewvc?view=rev&rev=516757
Log:
NUTCH-436 resolved.  Fixed behavior of urls with param 
(i.e. ;xxxx) information.  Finally found workaround for
problems that I was experiencing with EOL characters.

Modified:
    lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
    lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?view=diff&rev=516757&r1=516756&r2=516757
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Sat Mar 10 09:40:20 2007
@@ -282,6 +282,51 @@
 
     return false;
   }
+  
+  /**
+   * Handles cases where the url param information is encoded into the base
+   * url as opposed to the target.
+   * <p>
+   * If the target contains params (i.e. ';xxxx') information then the target 
+   * params information is assumed to be correct and any base params information
+   * is ignored.  If the base contains params information but the target does
+   * not, then the params information is moved to the target allowing it to be
+   * correctly determined by the java.net.URL class.
+   * 
+   * @param base The base URL.
+   * @param target The target path from the base URL.
+   * 
+   * @return URL A URL with the params information correctly encoded.
+   * 
+   * @throws MalformedURLException If the url is not a well formed URL.
+   */
+  private URL fixEmbeddedParams(URL base, String target) 
+    throws MalformedURLException{
+    
+    // if the target already contains a ';' or the base string contains none,
+    // no conversion is necessary: resolve the target against the base as-is
+    if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
+      return new URL(base, target);
+    }
+    
+    // base url as a string; params = everything from its first ';' onward
+    String baseURL = base.toString();
+    int startParams = baseURL.indexOf(';');
+    String params = baseURL.substring(startParams);
+    
+    // if the target has a query string, insert the params just before the
+    // '?'; otherwise simply append the params to the end of the target
+    int startQS = target.indexOf('?');
+    if (startQS >= 0) {
+      target = target.substring(0, startQS) + params + 
+        target.substring(startQS);
+    }
+    else {
+      target += params;
+    }
+    
+    return new URL(base, target);
+  }
 
   /**
    * This method finds all anchors below the supplied DOM
@@ -333,7 +378,9 @@
           }
           if (target != null && !noFollow && !post)
             try {
-              URL url = new URL(base, target);
+              
+              URL url = (base.toString().indexOf(';') > 0) ? 
+                fixEmbeddedParams(base, target) :  new URL(base, target);
               outlinks.add(new Outlink(url.toString(),
                                        linkText.toString().trim(), conf));
             } catch (MalformedURLException e) {

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?view=diff&rev=516757&r1=516756&r2=516757
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Sat Mar 10 09:40:20 2007
@@ -134,6 +134,20 @@
             + "<input type=submit><p>test1</p></form>"
             + "<form method='GET' action='/dummy.jsp'><input type=text>"
             + "<input type=submit><p>test2</p></form></body></html>"),
+    new String("<html><head><title> title </title>"
+      + "</head><body>"
+      + "<a href=\";x\">anchor1</a>"
+      + "<a href=\"g;x\">anchor2</a>"
+      + "<a href=\"g;x?y#s\">anchor3</a>"
+      + "</body></html>"),  
+    new String("<html><head><title> title </title>"
+        + "</head><body>"
+        + "<a href=\"g\">anchor1</a>"
+        + "<a href=\"g?y#s\">anchor2</a>"
+        + "<a href=\"?y=1\">anchor3</a>"
+        + "<a href=\"?y=1#s\">anchor4</a>"
+        + "<a href=\"?y=1;somethingelse\">anchor5</a>"
+        + "</body></html>"), 
   };
   
   private static int SKIP = 9;
@@ -149,6 +163,8 @@
     "http://www.nutch.org//",
     "http://www.nutch.org/",
     "http://www.nutch.org/",
+    "http://www.nutch.org/",
+    "http://www.nutch.org/;something"
   };
     
   private static final DocumentFragment testDOMs[]=
@@ -173,7 +189,9 @@
         + "End this madness ! . . . .",
     "ignore ignore",
     "test1 test2",
-    "test1 test2"
+    "test1 test2",
+    "title anchor1 anchor2 anchor3",
+    "title anchor1 anchor2 anchor3 anchor4 anchor5"
   };
 
   private static final String[] answerTitle= {
@@ -186,7 +204,9 @@
     "my title",
     "",
     "",
-    ""
+    "",
+    "title",
+    "title"
   };
 
   // note: should be in page-order
@@ -258,6 +278,18 @@
            new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf),
          },
          {
+         },
+         {
+           new Outlink("http://www.nutch.org/;x", "anchor1", conf),
+           new Outlink("http://www.nutch.org/g;x", "anchor2", conf),
+           new Outlink("http://www.nutch.org/g;x?y#s", "anchor3", conf)
+         },
+         {
+           new Outlink("http://www.nutch.org/g;something", "anchor1", conf),
+           new Outlink("http://www.nutch.org/g;something?y#s", "anchor2", conf),
+           new Outlink("http://www.nutch.org/;something?y=1", "anchor3", conf),
+           new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4", conf),
+           new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5", conf)
          }
       };