You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/11/23 20:55:14 UTC
svn commit: r348533 -
/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
Author: cutting
Date: Wed Nov 23 11:55:11 2005
New Revision: 348533
URL: http://svn.apache.org/viewcvs?rev=348533&view=rev
Log:
Fix: do not extract URLs from elements whose method attribute is "post".
Modified:
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=348533&r1=348532&r2=348533&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Wed Nov 23 11:55:11 2005
@@ -296,10 +296,7 @@
if (node.getNodeType() == Node.ELEMENT_NODE) {
LinkParams params = (LinkParams)linkParams.get(node.getNodeName().toLowerCase());
if (params != null) {
- if (shouldThrowAwayLink(node, children, childLen, params)) {
- // this has no inner structure or just a single nested
- // anchor-- toss it!
- } else {
+ if (!shouldThrowAwayLink(node, children, childLen, params)) {
StringBuffer linkText = new StringBuffer();
getText(linkText, node, true);
@@ -307,20 +304,21 @@
NamedNodeMap attrs = node.getAttributes();
String target = null;
boolean noFollow = false;
+ boolean post = false;
for (int i= 0; i < attrs.getLength(); i++ ) {
Node attr = attrs.item(i);
String attrName = attr.getNodeName();
-
- if ("rel".equalsIgnoreCase(attrName) &&
- "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
- noFollow = true;
- }
-
if (params.attrName.equalsIgnoreCase(attrName)) {
target = attr.getNodeValue();
+ } else if ("rel".equalsIgnoreCase(attrName) &&
+ "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+ noFollow = true;
+ } else if ("method".equalsIgnoreCase(attrName) &&
+ "post".equalsIgnoreCase(attr.getNodeValue())) {
+ post = true;
}
}
- if (target != null && !noFollow)
+ if (target != null && !noFollow && !post)
try {
URL url = new URL(base, target);
outlinks.add(new Outlink(url.toString(),