You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/05/25 02:38:17 UTC
svn commit: r409275 - in /lucene/nutch/trunk: conf/
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/
src/plugin/parse-html/src/test/org/apache/nutch/parse/html/
Author: ab
Date: Wed May 24 17:38:16 2006
New Revision: 409275
URL: http://svn.apache.org/viewvc?rev=409275&view=rev
Log:
Fix for incorrect behavior (collecting action URLs from forms). This
is now optional, and turned off by default.
Update JUnit test to cover this option.
Modified:
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=409275&r1=409274&r2=409275&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed May 24 17:38:16 2006
@@ -636,6 +636,16 @@
</description>
</property>
+<property>
+ <name>parser.html.form.use_action</name>
+ <value>false</value>
+ <description>If true, HTML parser will collect URLs from form action
+ attributes. This may lead to undesirable behavior (submitting empty
+ forms during next fetch cycle). If false, form action attribute will
+ be ignored.</description>
+</property>
+
+
<!-- urlfilter plugin properties -->
<property>
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=409275&r1=409274&r2=409275&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Wed May 24 17:38:16 2006
@@ -51,17 +51,27 @@
}
}
- public static HashMap linkParams = new HashMap();
+ private HashMap linkParams = new HashMap();
+ private Configuration conf;
- static {
- linkParams.put("a", new LinkParams("a", "href", 1));
- linkParams.put("area", new LinkParams("area", "href", 0));
+
+ public DOMContentUtils(Configuration conf) {
+ setConf(conf);
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ linkParams.clear();
+ linkParams.put("a", new LinkParams("a", "href", 1));
+ linkParams.put("area", new LinkParams("area", "href", 0));
+ if (conf.getBoolean("parser.html.form.use_action", false)) {
linkParams.put("form", new LinkParams("form", "action", 1));
- linkParams.put("frame", new LinkParams("frame", "src", 0));
- linkParams.put("iframe", new LinkParams("iframe", "src", 0));
- linkParams.put("script", new LinkParams("script", "src", 0));
- linkParams.put("link", new LinkParams("link", "href", 0));
- linkParams.put("img", new LinkParams("img", "src", 0));
+ }
+ linkParams.put("frame", new LinkParams("frame", "src", 0));
+ linkParams.put("iframe", new LinkParams("iframe", "src", 0));
+ linkParams.put("script", new LinkParams("script", "src", 0));
+ linkParams.put("link", new LinkParams("link", "href", 0));
+ linkParams.put("img", new LinkParams("img", "src", 0));
}
/**
@@ -79,7 +89,7 @@
*
* @return true if nested anchors were found
*/
- public static final boolean getText(StringBuffer sb, Node node,
+ public boolean getText(StringBuffer sb, Node node,
boolean abortOnNestedAnchors) {
if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
return true;
@@ -93,13 +103,13 @@
* #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
*
*/
- public static final void getText(StringBuffer sb, Node node) {
+ public void getText(StringBuffer sb, Node node) {
getText(sb, node, false);
}
// returns true if abortOnNestedAnchors is true and we find nested
// anchors
- private static final boolean getTextHelper(StringBuffer sb, Node node,
+ private boolean getTextHelper(StringBuffer sb, Node node,
boolean abortOnNestedAnchors,
int anchorDepth) {
if ("script".equalsIgnoreCase(node.getNodeName())) {
@@ -148,7 +158,7 @@
*
* @return true if a title node was found, false otherwise
*/
- public static final boolean getTitle(StringBuffer sb, Node node) {
+ public boolean getTitle(StringBuffer sb, Node node) {
if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD
return false;
@@ -171,7 +181,7 @@
}
/** If Node contains a BASE tag then its HREF is returned. */
- public static final URL getBase(Node node) {
+ public URL getBase(Node node) {
// is this node a BASE tag?
if (node.getNodeType() == Node.ELEMENT_NODE) {
@@ -209,7 +219,7 @@
}
- private static boolean hasOnlyWhiteSpace(Node node) {
+ private boolean hasOnlyWhiteSpace(Node node) {
String val= node.getNodeValue();
for (int i= 0; i < val.length(); i++) {
if (!Character.isWhitespace(val.charAt(i)))
@@ -220,7 +230,7 @@
// this only covers a few cases of empty links that are symptomatic
// of nekohtml's DOM-fixup process...
- private static boolean shouldThrowAwayLink(Node node, NodeList children,
+ private boolean shouldThrowAwayLink(Node node, NodeList children,
int childLen, LinkParams params) {
if (childLen == 0) {
// this has no inner structure
@@ -286,8 +296,8 @@
* nodes (this is a common DOM-fixup artifact, at least with
* nekohtml).
*/
- public static final void getOutlinks(URL base, ArrayList outlinks,
- Node node, Configuration conf) {
+ public void getOutlinks(URL base, ArrayList outlinks,
+ Node node) {
NodeList children = node.getChildNodes();
int childLen= 0;
@@ -295,7 +305,8 @@
childLen= children.getLength();
if (node.getNodeType() == Node.ELEMENT_NODE) {
- LinkParams params = (LinkParams)linkParams.get(node.getNodeName().toLowerCase());
+ String nodeName = node.getNodeName().toLowerCase();
+ LinkParams params = (LinkParams)linkParams.get(nodeName);
if (params != null) {
if (!shouldThrowAwayLink(node, children, childLen, params)) {
@@ -333,7 +344,7 @@
}
}
for ( int i = 0; i < childLen; i++ ) {
- getOutlinks(base, outlinks, children.item(i), conf);
+ getOutlinks(base, outlinks, children.item(i));
}
}
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=409275&r1=409274&r2=409275&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed May 24 17:38:16 2006
@@ -95,6 +95,8 @@
private String defaultCharEncoding;
private Configuration conf;
+
+ private DOMContentUtils utils;
private HtmlParseFilters htmlParseFilters;
@@ -172,19 +174,19 @@
if (!metaTags.getNoIndex()) { // okay to index
StringBuffer sb = new StringBuffer();
LOG.fine("Getting text...");
- DOMContentUtils.getText(sb, root); // extract text
+ utils.getText(sb, root); // extract text
text = sb.toString();
sb.setLength(0);
LOG.fine("Getting title...");
- DOMContentUtils.getTitle(sb, root); // extract title
+ utils.getTitle(sb, root); // extract title
title = sb.toString().trim();
}
if (!metaTags.getNoFollow()) { // okay to follow links
ArrayList l = new ArrayList(); // extract outlinks
- URL baseTag = DOMContentUtils.getBase(root);
+ URL baseTag = utils.getBase(root);
LOG.fine("Getting links...");
- DOMContentUtils.getOutlinks(baseTag!=null?baseTag:base, l, root, getConf());
+ utils.getOutlinks(baseTag!=null?baseTag:base, l, root);
outlinks = (Outlink[])l.toArray(new Outlink[l.size()]);
LOG.fine("found "+outlinks.length+" outlinks in "+content.getUrl());
}
@@ -286,6 +288,7 @@
this.parserImpl = getConf().get("parser.html.impl", "neko");
this.defaultCharEncoding = getConf().get(
"parser.character.encoding.default", "windows-1252");
+ this.utils = new DOMContentUtils(conf);
}
public Configuration getConf() {
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=409275&r1=409274&r2=409275&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Wed May 24 17:38:16 2006
@@ -121,7 +121,21 @@
+ "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+ "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+ "</body></html>"),
+ // test that POST form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ // test that all form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
};
+
+ private static int SKIP = 9;
private static String[] testBaseHrefs= {
"http://www.nutch.org",
@@ -132,8 +146,10 @@
"http://www.nutch.org/maps/",
"http://www.nutch.org/whitespace/",
"http://www.nutch.org//",
+ "http://www.nutch.org/",
+ "http://www.nutch.org/",
};
-
+
private static final DocumentFragment testDOMs[]=
new DocumentFragment[testPages.length];
@@ -155,6 +171,8 @@
+ "one two two three three four put some text here and there. "
+ "End this madness ! . . . .",
"ignore ignore",
+ "test1 test2",
+ "test1 test2"
};
private static final String[] answerTitle= {
@@ -166,17 +184,24 @@
"my title",
"my title",
"",
+ "",
+ ""
};
// note: should be in page-order
private static Outlink[][] answerOutlinks;
+ private static Configuration conf;
+ private static DOMContentUtils utils = null;
+
public TestDOMContentUtils(String name) {
super(name);
}
private static void setup() {
- Configuration conf = NutchConfiguration.create();
+ conf = NutchConfiguration.create();
+ conf.setBoolean("parser.html.form.use_action", true);
+ utils = new DOMContentUtils(conf);
DOMFragmentParser parser= new DOMFragmentParser();
for (int i= 0; i < testPages.length; i++) {
DocumentFragment node=
@@ -227,6 +252,11 @@
new Outlink("http://www.nutch.org/index.html", "whitespace test", conf),
},
{
+ },
+ {
+ new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf),
+ },
+ {
}
};
@@ -255,7 +285,7 @@
setup();
for (int i= 0; i < testPages.length; i++) {
StringBuffer sb= new StringBuffer();
- DOMContentUtils.getText(sb, testDOMs[i]);
+ utils.getText(sb, testDOMs[i]);
String text= sb.toString();
assertTrue("expecting text: " + answerText[i]
+ System.getProperty("line.separator")
@@ -270,7 +300,7 @@
setup();
for (int i= 0; i < testPages.length; i++) {
StringBuffer sb= new StringBuffer();
- DOMContentUtils.getTitle(sb, testDOMs[i]);
+ utils.getTitle(sb, testDOMs[i]);
String text= sb.toString();
assertTrue("expecting text: " + answerText[i]
+ System.getProperty("line.separator")
@@ -285,7 +315,14 @@
setup();
for (int i= 0; i < testPages.length; i++) {
ArrayList outlinks= new ArrayList();
- DOMContentUtils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i], NutchConfiguration.create());
+ if (i == SKIP) {
+ conf.setBoolean("parser.html.form.use_action", false);
+ utils.setConf(conf);
+ } else {
+ conf.setBoolean("parser.html.form.use_action", true);
+ utils.setConf(conf);
+ }
+ utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
Outlink[] outlinkArr= new Outlink[outlinks.size()];
outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr);
compareOutlinks(answerOutlinks[i], outlinkArr);