You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/10/21 23:04:57 UTC

svn commit: r327581 - in /lucene/nutch/branches/mapred/src/plugin/parse-html/src: java/org/apache/nutch/parse/html/DOMContentUtils.java test/org/apache/nutch/parse/html/TestDOMContentUtils.java

Author: cutting
Date: Fri Oct 21 14:04:54 2005
New Revision: 327581

URL: http://svn.apache.org/viewcvs?rev=327581&view=rev
Log:
Ignore links marked rel=nofollow when extracting outlinks from HTML.

Modified:
    lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
    lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java

Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Oct 21 14:04:54 2005
@@ -306,13 +306,21 @@
 
           NamedNodeMap attrs = node.getAttributes();
           String target = null;
+          boolean noFollow = false;
           for (int i= 0; i < attrs.getLength(); i++ ) {
-            if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) {
-              target = attrs.item(i).getNodeValue();
-              break;
+            Node attr = attrs.item(i);
+            String attrName = attr.getNodeName();
+
+            if ("rel".equalsIgnoreCase(attrName) &&
+                "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+              noFollow = true;
+            }
+
+            if (params.attrName.equalsIgnoreCase(attrName)) {
+              target = attr.getNodeValue();
             }
           }
-          if (target != null)
+          if (target != null && !noFollow)
             try {
               URL url = new URL(base, target);
               outlinks.add(new Outlink(url.toString(),

Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Oct 21 14:04:54 2005
@@ -113,6 +113,12 @@
                + "<h2>End\tthis\rmadness\n!</h2>\r\n"
                + "         .        .        .         ."
                + "</body>  </html>"),
+
+    // test that <a rel=nofollow> links are not returned
+    new String("<html><head></head><body>"
+               + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+               + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+               + "</body></html>"),
   };
 
   private static String[] testBaseHrefs= {
@@ -123,6 +129,7 @@
     "http://www.nutch.org/frames/",     
     "http://www.nutch.org/maps/",
     "http://www.nutch.org/whitespace/",
+    "http://www.nutch.org//",
   };
   
   private static final DocumentFragment testDOMs[]=
@@ -145,6 +152,7 @@
         + "one two three space here space there no space "
         + "one two two three three four put some text here and there. "
         + "End this madness ! . . . .",
+    "ignore ignore",
   };
 
   private static final String[] answerTitle= {
@@ -155,6 +163,7 @@
     "my title",
     "my title",
     "my title",
+    "",
   };
 
   // note: should be in page-order
@@ -214,6 +223,8 @@
          {
              new Outlink("http://www.nutch.org/index.html", "whitespace test"),
          },
+         {
+         }
       };
    
     } catch (MalformedURLException e) {