You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by pk...@apache.org on 2006/03/25 12:19:37 UTC
svn commit: r388742 - in /lucene/nutch/branches/branch-0.7: CHANGES.txt
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
Author: pkosiorowski
Date: Sat Mar 25 03:19:31 2006
New Revision: 388742
URL: http://svn.apache.org/viewcvs?rev=388742&view=rev
Log:
Skip outlinks from elements with method="post" or rel="nofollow"
Modified:
lucene/nutch/branches/branch-0.7/CHANGES.txt
lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
Modified: lucene/nutch/branches/branch-0.7/CHANGES.txt
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=388742&r1=388741&r2=388742&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.7/CHANGES.txt (original)
+++ lucene/nutch/branches/branch-0.7/CHANGES.txt Sat Mar 25 03:19:31 2006
@@ -22,6 +22,8 @@
9. Commons HTTPClient upgraded to version 3.0.
+10. Skipping "post" and "nofollow" outlinks.
+
Release 0.7.1 - 2005-10-01
Modified: lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=388742&r1=388741&r2=388742&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Sat Mar 25 03:19:31 2006
@@ -296,23 +296,29 @@
if (node.getNodeType() == Node.ELEMENT_NODE) {
LinkParams params = (LinkParams)linkParams.get(node.getNodeName().toLowerCase());
if (params != null) {
- if (shouldThrowAwayLink(node, children, childLen, params)) {
- // this has no inner structure or just a single nested
- // anchor-- toss it!
- } else {
+ if (!shouldThrowAwayLink(node, children, childLen, params)) {
StringBuffer linkText = new StringBuffer();
getText(linkText, node, true);
NamedNodeMap attrs = node.getAttributes();
String target = null;
+ boolean noFollow = false;
+ boolean post = false;
for (int i= 0; i < attrs.getLength(); i++ ) {
- if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) {
- target = attrs.item(i).getNodeValue();
- break;
+ Node attr = attrs.item(i);
+ String attrName = attr.getNodeName();
+ if (params.attrName.equalsIgnoreCase(attrName)) {
+ target = attr.getNodeValue();
+ } else if ("rel".equalsIgnoreCase(attrName) &&
+ "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+ noFollow = true;
+ } else if ("method".equalsIgnoreCase(attrName) &&
+ "post".equalsIgnoreCase(attr.getNodeValue())) {
+ post = true;
}
}
- if (target != null)
+ if (target != null && !noFollow && !post)
try {
URL url = new URL(base, target);
outlinks.add(new Outlink(url.toString(),