You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by pk...@apache.org on 2006/03/25 12:19:37 UTC

svn commit: r388742 - in /lucene/nutch/branches/branch-0.7: CHANGES.txt src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java

Author: pkosiorowski
Date: Sat Mar 25 03:19:31 2006
New Revision: 388742

URL: http://svn.apache.org/viewcvs?rev=388742&view=rev
Log:
Skip outlinks from elements with method="post" (form submissions) and from links marked rel="nofollow"

Modified:
    lucene/nutch/branches/branch-0.7/CHANGES.txt
    lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java

Modified: lucene/nutch/branches/branch-0.7/CHANGES.txt
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=388742&r1=388741&r2=388742&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.7/CHANGES.txt (original)
+++ lucene/nutch/branches/branch-0.7/CHANGES.txt Sat Mar 25 03:19:31 2006
@@ -22,6 +22,8 @@
 
  9. Commons HTTPClient upgraded to version 3.0.
 
+10. Skipping "post" and "nofollow" outlinks.
+
 
 Release 0.7.1 - 2005-10-01
 

Modified: lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=388742&r1=388741&r2=388742&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Sat Mar 25 03:19:31 2006
@@ -296,23 +296,29 @@
     if (node.getNodeType() == Node.ELEMENT_NODE) {
       LinkParams params = (LinkParams)linkParams.get(node.getNodeName().toLowerCase());
       if (params != null) {
-        if (shouldThrowAwayLink(node, children, childLen, params)) {
-          // this has no inner structure or just a single nested
-          // anchor-- toss it!
-        } else {
+        if (!shouldThrowAwayLink(node, children, childLen, params)) {
 
           StringBuffer linkText = new StringBuffer();
           getText(linkText, node, true);
 
           NamedNodeMap attrs = node.getAttributes();
           String target = null;
+          boolean noFollow = false;
+          boolean post = false;
           for (int i= 0; i < attrs.getLength(); i++ ) {
-            if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) {
-              target = attrs.item(i).getNodeValue();
-              break;
+            Node attr = attrs.item(i);
+            String attrName = attr.getNodeName();
+            if (params.attrName.equalsIgnoreCase(attrName)) {
+              target = attr.getNodeValue();
+            } else if ("rel".equalsIgnoreCase(attrName) &&
+                       "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+              noFollow = true;
+            } else if ("method".equalsIgnoreCase(attrName) &&
+                       "post".equalsIgnoreCase(attr.getNodeValue())) {
+              post = true;
             }
           }
-          if (target != null)
+          if (target != null && !noFollow && !post)
             try {
               URL url = new URL(base, target);
               outlinks.add(new Outlink(url.toString(),