You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/10/21 23:04:57 UTC
svn commit: r327581 - in
/lucene/nutch/branches/mapred/src/plugin/parse-html/src:
java/org/apache/nutch/parse/html/DOMContentUtils.java
test/org/apache/nutch/parse/html/TestDOMContentUtils.java
Author: cutting
Date: Fri Oct 21 14:04:54 2005
New Revision: 327581
URL: http://svn.apache.org/viewcvs?rev=327581&view=rev
Log:
Ignore rel=nofollow links.
Modified:
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Oct 21 14:04:54 2005
@@ -306,13 +306,21 @@
NamedNodeMap attrs = node.getAttributes();
String target = null;
+ boolean noFollow = false;
for (int i= 0; i < attrs.getLength(); i++ ) {
- if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) {
- target = attrs.item(i).getNodeValue();
- break;
+ Node attr = attrs.item(i);
+ String attrName = attr.getNodeName();
+
+ if ("rel".equalsIgnoreCase(attrName) &&
+ "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+ noFollow = true;
+ }
+
+ if (params.attrName.equalsIgnoreCase(attrName)) {
+ target = attr.getNodeValue();
}
}
- if (target != null)
+ if (target != null && !noFollow)
try {
URL url = new URL(base, target);
outlinks.add(new Outlink(url.toString(),
Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Oct 21 14:04:54 2005
@@ -113,6 +113,12 @@
+ "<h2>End\tthis\rmadness\n!</h2>\r\n"
+ " . . . ."
+ "</body> </html>"),
+
+ // test that <a rel=nofollow> links are not returned
+ new String("<html><head></head><body>"
+ + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+ + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+ + "</body></html>"),
};
private static String[] testBaseHrefs= {
@@ -123,6 +129,7 @@
"http://www.nutch.org/frames/",
"http://www.nutch.org/maps/",
"http://www.nutch.org/whitespace/",
+ "http://www.nutch.org//",
};
private static final DocumentFragment testDOMs[]=
@@ -145,6 +152,7 @@
+ "one two three space here space there no space "
+ "one two two three three four put some text here and there. "
+ "End this madness ! . . . .",
+ "ignore ignore",
};
private static final String[] answerTitle= {
@@ -155,6 +163,7 @@
"my title",
"my title",
"my title",
+ "",
};
// note: should be in page-order
@@ -214,6 +223,8 @@
{
new Outlink("http://www.nutch.org/index.html", "whitespace test"),
},
+ {
+ }
};
} catch (MalformedURLException e) {