You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/05/25 02:38:17 UTC
svn commit: r409275 - in /lucene/nutch/trunk: conf/ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/

Author: ab
Date: Wed May 24 17:38:16 2006
New Revision: 409275

URL: http://svn.apache.org/viewvc?rev=409275&view=rev
Log:
Fix for incorrect behavior (collecting action URLs from forms). This
is now optional, and turned off by default.

Update JUnit test to cover this option.

Modified:
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
    lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
    lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=409275&r1=409274&r2=409275&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed May 24 17:38:16 2006
@@ -636,6 +636,16 @@
   </description>
 </property>
 
+<property>
+  <name>parser.html.form.use_action</name>
+  <value>false</value>
+  <description>If true, HTML parser will collect URLs from form action
+  attributes. This may lead to undesirable behavior (submitting empty
+  forms during next fetch cycle). If false, form action attribute will
+  be ignored.</description>
+</property>
+
+
 <!-- urlfilter plugin properties -->
 
 <property>

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=409275&r1=409274&r2=409275&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Wed May 24 17:38:16 2006
@@ -51,17 +51,27 @@
       }
   }
   
-  public static HashMap linkParams = new HashMap();
+  private HashMap linkParams = new HashMap();
+  private Configuration conf;
   
-  static {
-      linkParams.put("a", new LinkParams("a", "href", 1));
-      linkParams.put("area", new LinkParams("area", "href", 0));
+  
+  public DOMContentUtils(Configuration conf) {
+    setConf(conf);
+  }
+  
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    linkParams.clear();
+    linkParams.put("a", new LinkParams("a", "href", 1));
+    linkParams.put("area", new LinkParams("area", "href", 0));
+    if (conf.getBoolean("parser.html.form.use_action", false)) {
       linkParams.put("form", new LinkParams("form", "action", 1));
-      linkParams.put("frame", new LinkParams("frame", "src", 0));
-      linkParams.put("iframe", new LinkParams("iframe", "src", 0));
-      linkParams.put("script", new LinkParams("script", "src", 0));
-      linkParams.put("link", new LinkParams("link", "href", 0));
-      linkParams.put("img", new LinkParams("img", "src", 0));
+    }
+    linkParams.put("frame", new LinkParams("frame", "src", 0));
+    linkParams.put("iframe", new LinkParams("iframe", "src", 0));
+    linkParams.put("script", new LinkParams("script", "src", 0));
+    linkParams.put("link", new LinkParams("link", "href", 0));
+    linkParams.put("img", new LinkParams("img", "src", 0));
   }
   
   /**
@@ -79,7 +89,7 @@
    *
    * @return true if nested anchors were found
    */
-  public static final boolean getText(StringBuffer sb, Node node, 
+  public boolean getText(StringBuffer sb, Node node, 
                                       boolean abortOnNestedAnchors) {
     if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
       return true;
@@ -93,13 +103,13 @@
    * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
    * 
    */
-  public static final void getText(StringBuffer sb, Node node) {
+  public void getText(StringBuffer sb, Node node) {
     getText(sb, node, false);
   }
 
   // returns true if abortOnNestedAnchors is true and we find nested 
   // anchors
-  private static final boolean getTextHelper(StringBuffer sb, Node node, 
+  private boolean getTextHelper(StringBuffer sb, Node node, 
                                              boolean abortOnNestedAnchors,
                                              int anchorDepth) {
     if ("script".equalsIgnoreCase(node.getNodeName())) {
@@ -148,7 +158,7 @@
    *
    * @return true if a title node was found, false otherwise
    */
-  public static final boolean getTitle(StringBuffer sb, Node node) {
+  public boolean getTitle(StringBuffer sb, Node node) {
     if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD
       return false;
 
@@ -171,7 +181,7 @@
   }
 
   /** If Node contains a BASE tag then its HREF is returned. */
-  public static final URL getBase(Node node) {
+  public URL getBase(Node node) {
 
     // is this node a BASE tag?
     if (node.getNodeType() == Node.ELEMENT_NODE) {
@@ -209,7 +219,7 @@
   }
 
 
-  private static boolean hasOnlyWhiteSpace(Node node) {
+  private boolean hasOnlyWhiteSpace(Node node) {
     String val= node.getNodeValue();
     for (int i= 0; i < val.length(); i++) {
       if (!Character.isWhitespace(val.charAt(i)))
@@ -220,7 +230,7 @@
 
   // this only covers a few cases of empty links that are symptomatic
   // of nekohtml's DOM-fixup process...
-  private static boolean shouldThrowAwayLink(Node node, NodeList children, 
+  private boolean shouldThrowAwayLink(Node node, NodeList children, 
                                               int childLen, LinkParams params) {
     if (childLen == 0) {
       // this has no inner structure 
@@ -286,8 +296,8 @@
    * nodes (this is a common DOM-fixup artifact, at least with
    * nekohtml).
    */
-  public static final void getOutlinks(URL base, ArrayList outlinks, 
-                                       Node node, Configuration conf) {
+  public void getOutlinks(URL base, ArrayList outlinks, 
+                                       Node node) {
 
     NodeList children = node.getChildNodes();
     int childLen= 0;
@@ -295,7 +305,8 @@
       childLen= children.getLength();
   
     if (node.getNodeType() == Node.ELEMENT_NODE) {
-      LinkParams params = (LinkParams)linkParams.get(node.getNodeName().toLowerCase());
+      String nodeName = node.getNodeName().toLowerCase();
+      LinkParams params = (LinkParams)linkParams.get(nodeName);
       if (params != null) {
         if (!shouldThrowAwayLink(node, children, childLen, params)) {
 
@@ -333,7 +344,7 @@
       }
     }
     for ( int i = 0; i < childLen; i++ ) {
-      getOutlinks(base, outlinks, children.item(i), conf);
+      getOutlinks(base, outlinks, children.item(i));
     }
   }
 

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=409275&r1=409274&r2=409275&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed May 24 17:38:16 2006
@@ -95,6 +95,8 @@
   private String defaultCharEncoding;
 
   private Configuration conf;
+  
+  private DOMContentUtils utils;
 
   private HtmlParseFilters htmlParseFilters;
 
@@ -172,19 +174,19 @@
     if (!metaTags.getNoIndex()) {               // okay to index
       StringBuffer sb = new StringBuffer();
       LOG.fine("Getting text...");
-      DOMContentUtils.getText(sb, root);          // extract text
+      utils.getText(sb, root);          // extract text
       text = sb.toString();
       sb.setLength(0);
       LOG.fine("Getting title...");
-      DOMContentUtils.getTitle(sb, root);         // extract title
+      utils.getTitle(sb, root);         // extract title
       title = sb.toString().trim();
     }
       
     if (!metaTags.getNoFollow()) {              // okay to follow links
       ArrayList l = new ArrayList();              // extract outlinks
-      URL baseTag = DOMContentUtils.getBase(root);
+      URL baseTag = utils.getBase(root);
       LOG.fine("Getting links...");
-      DOMContentUtils.getOutlinks(baseTag!=null?baseTag:base, l, root, getConf());
+      utils.getOutlinks(baseTag!=null?baseTag:base, l, root);
       outlinks = (Outlink[])l.toArray(new Outlink[l.size()]);
       LOG.fine("found "+outlinks.length+" outlinks in "+content.getUrl());
     }
@@ -286,6 +288,7 @@
     this.parserImpl = getConf().get("parser.html.impl", "neko");
     this.defaultCharEncoding = getConf().get(
         "parser.character.encoding.default", "windows-1252");
+    this.utils = new DOMContentUtils(conf);
   }
 
   public Configuration getConf() {

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=409275&r1=409274&r2=409275&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Wed May 24 17:38:16 2006
@@ -121,7 +121,21 @@
                + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
                + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
                + "</body></html>"),
+    // test that POST form actions are skipped (the GET form action is still collected)
+    new String("<html><head></head><body>"
+            + "<form method='POST' action='/search.jsp'><input type=text>"
+            + "<input type=submit><p>test1</p></form>"
+            + "<form method='GET' action='/dummy.jsp'><input type=text>"
+            + "<input type=submit><p>test2</p></form></body></html>"),
+    // test that all form actions are skipped when parser.html.form.use_action is false
+    new String("<html><head></head><body>"
+            + "<form method='POST' action='/search.jsp'><input type=text>"
+            + "<input type=submit><p>test1</p></form>"
+            + "<form method='GET' action='/dummy.jsp'><input type=text>"
+            + "<input type=submit><p>test2</p></form></body></html>"),
   };
+  
+  private static int SKIP = 9;
 
   private static String[] testBaseHrefs= {
     "http://www.nutch.org",     
@@ -132,8 +146,10 @@
     "http://www.nutch.org/maps/",
     "http://www.nutch.org/whitespace/",
     "http://www.nutch.org//",
+    "http://www.nutch.org/",
+    "http://www.nutch.org/",
   };
-  
+    
   private static final DocumentFragment testDOMs[]=
     new DocumentFragment[testPages.length];
 
@@ -155,6 +171,8 @@
         + "one two two three three four put some text here and there. "
         + "End this madness ! . . . .",
     "ignore ignore",
+    "test1 test2",
+    "test1 test2"
   };
 
   private static final String[] answerTitle= {
@@ -166,17 +184,24 @@
     "my title",
     "my title",
     "",
+    "",
+    ""
   };
 
   // note: should be in page-order
   private static Outlink[][] answerOutlinks;
   
+  private static Configuration conf;
+  private static DOMContentUtils utils = null;
+  
   public TestDOMContentUtils(String name) { 
     super(name); 
   }
 
   private static void setup() {
-    Configuration conf = NutchConfiguration.create();
+    conf = NutchConfiguration.create();
+    conf.setBoolean("parser.html.form.use_action", true);
+    utils = new DOMContentUtils(conf);
     DOMFragmentParser parser= new DOMFragmentParser();
     for (int i= 0; i < testPages.length; i++) {
         DocumentFragment node= 
@@ -227,6 +252,11 @@
              new Outlink("http://www.nutch.org/index.html", "whitespace test", conf),
          },
          {
+         },
+         {
+           new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf),
+         },
+         {
          }
       };
    
@@ -255,7 +285,7 @@
       setup();
     for (int i= 0; i < testPages.length; i++) {
       StringBuffer sb= new StringBuffer();
-      DOMContentUtils.getText(sb, testDOMs[i]);
+      utils.getText(sb, testDOMs[i]);
       String text= sb.toString();
       assertTrue("expecting text: " + answerText[i] 
                  + System.getProperty("line.separator") 
@@ -270,7 +300,7 @@
       setup();
     for (int i= 0; i < testPages.length; i++) {
       StringBuffer sb= new StringBuffer();
-      DOMContentUtils.getTitle(sb, testDOMs[i]);
+      utils.getTitle(sb, testDOMs[i]);
       String text= sb.toString();
       assertTrue("expecting text: " + answerText[i] 
                  + System.getProperty("line.separator") 
@@ -285,7 +315,14 @@
       setup();
     for (int i= 0; i < testPages.length; i++) {
       ArrayList outlinks= new ArrayList();
-      DOMContentUtils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i], NutchConfiguration.create());
+      if (i == SKIP) {
+        conf.setBoolean("parser.html.form.use_action", false);
+        utils.setConf(conf);
+      } else {
+        conf.setBoolean("parser.html.form.use_action", true);
+        utils.setConf(conf);
+      }
+      utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
       Outlink[] outlinkArr= new Outlink[outlinks.size()];
       outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr);
       compareOutlinks(answerOutlinks[i], outlinkArr);