You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/11/23 20:55:14 UTC

svn commit: r348533 - /lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java

Author: cutting
Date: Wed Nov 23 11:55:11 2005
New Revision: 348533

URL: http://svn.apache.org/viewcvs?rev=348533&view=rev
Log:
Fix: do not extract URLs from elements whose method attribute is "post".

Modified:
    lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java

Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=348533&r1=348532&r2=348533&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Wed Nov 23 11:55:11 2005
@@ -296,10 +296,7 @@
     if (node.getNodeType() == Node.ELEMENT_NODE) {
       LinkParams params = (LinkParams)linkParams.get(node.getNodeName().toLowerCase());
       if (params != null) {
-        if (shouldThrowAwayLink(node, children, childLen, params)) {
-          // this has no inner structure or just a single nested
-          // anchor-- toss it!
-        } else {
+        if (!shouldThrowAwayLink(node, children, childLen, params)) {
 
           StringBuffer linkText = new StringBuffer();
           getText(linkText, node, true);
@@ -307,20 +304,21 @@
           NamedNodeMap attrs = node.getAttributes();
           String target = null;
           boolean noFollow = false;
+          boolean post = false;
           for (int i= 0; i < attrs.getLength(); i++ ) {
             Node attr = attrs.item(i);
             String attrName = attr.getNodeName();
-
-            if ("rel".equalsIgnoreCase(attrName) &&
-                "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
-              noFollow = true;
-            }
-
             if (params.attrName.equalsIgnoreCase(attrName)) {
               target = attr.getNodeValue();
+            } else if ("rel".equalsIgnoreCase(attrName) &&
+                       "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+              noFollow = true;
+            } else if ("method".equalsIgnoreCase(attrName) &&
+                       "post".equalsIgnoreCase(attr.getNodeValue())) {
+              post = true;
             }
           }
-          if (target != null && !noFollow)
+          if (target != null && !noFollow && !post)
             try {
               URL url = new URL(base, target);
               outlinks.add(new Outlink(url.toString(),