You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ku...@apache.org on 2007/03/10 04:55:24 UTC
svn commit: r516648 - in /lucene/nutch/trunk/src/plugin/parse-html/src: java/org/apache/nutch/parse/html/DOMContentUtils.java test/org/apache/nutch/parse/html/TestDOMContentUtils.java

Author: kubes
Date: Fri Mar  9 19:55:23 2007
New Revision: 516648

URL: http://svn.apache.org/viewvc?view=rev&rev=516648
Log:
NUTCH-436 resolved.  Fixed behavior of urls with param 
(i.e. ;xxxx) information.  My problem with EOL characters on 
commit should be resolved.

Modified:
    lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
    lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?view=diff&rev=516648&r1=516647&r2=516648
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Mar  9 19:55:23 2007
@@ -1,353 +1,400 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import java.net.URL;
-import java.net.MalformedURLException;
-import java.util.ArrayList;
-import java.util.HashMap;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.hadoop.conf.Configuration;
-
-import org.w3c.dom.*;
-
-/**
- * A collection of methods for extracting content from DOM trees.
- * 
- * This class holds a few utility methods for pulling content out of 
- * DOM nodes, such as getOutlinks, getText, etc.
- *
- */
-public class DOMContentUtils {
-
-  public static class LinkParams {
-    public String elName;
-    public String attrName;
-      public int childLen;
-      
-      public LinkParams(String elName, String attrName, int childLen) {
-          this.elName = elName;
-          this.attrName = attrName;
-          this.childLen = childLen;
-      }
-      
-      public String toString() {
-          return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
-      }
-  }
-  
-  private HashMap linkParams = new HashMap();
-  private Configuration conf;
-  
-  
-  public DOMContentUtils(Configuration conf) {
-    setConf(conf);
-  }
-  
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    linkParams.clear();
-    linkParams.put("a", new LinkParams("a", "href", 1));
-    linkParams.put("area", new LinkParams("area", "href", 0));
-    if (conf.getBoolean("parser.html.form.use_action", false)) {
-      linkParams.put("form", new LinkParams("form", "action", 1));
-    }
-    linkParams.put("frame", new LinkParams("frame", "src", 0));
-    linkParams.put("iframe", new LinkParams("iframe", "src", 0));
-    linkParams.put("script", new LinkParams("script", "src", 0));
-    linkParams.put("link", new LinkParams("link", "href", 0));
-    linkParams.put("img", new LinkParams("img", "src", 0));
-  }
-  
-  /**
-   * This method takes a {@link StringBuffer} and a DOM {@link Node},
-   * and will append all the content text found beneath the DOM node to 
-   * the <code>StringBuffer</code>.
-   *
-   * <p>
-   *
-   * If <code>abortOnNestedAnchors</code> is true, DOM traversal will
-   * be aborted and the <code>StringBuffer</code> will not contain
-   * any text encountered after a nested anchor is found.
-   * 
-   * <p>
-   *
-   * @return true if nested anchors were found
-   */
-  public boolean getText(StringBuffer sb, Node node, 
-                                      boolean abortOnNestedAnchors) {
-    if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
-      return true;
-    } 
-    return false;
-  }
-
-
-  /**
-   * This is a convinience method, equivalent to {@link
-   * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
-   * 
-   */
-  public void getText(StringBuffer sb, Node node) {
-    getText(sb, node, false);
-  }
-
-  // returns true if abortOnNestedAnchors is true and we find nested 
-  // anchors
-  private boolean getTextHelper(StringBuffer sb, Node node, 
-                                             boolean abortOnNestedAnchors,
-                                             int anchorDepth) {
-    if ("script".equalsIgnoreCase(node.getNodeName())) {
-      return false;
-    }
-    if ("style".equalsIgnoreCase(node.getNodeName())) {
-      return false;
-    }
-    if (abortOnNestedAnchors && "a".equalsIgnoreCase(node.getNodeName())) {
-      anchorDepth++;
-      if (anchorDepth > 1)
-        return true;
-    }
-    if (node.getNodeType() == Node.COMMENT_NODE) {
-      return false;
-    }
-    if (node.getNodeType() == Node.TEXT_NODE) {
-      // cleanup and trim the value
-      String text = node.getNodeValue();
-      text = text.replaceAll("\\s+", " ");
-      text = text.trim();
-      if (text.length() > 0) {
-        if (sb.length() > 0) sb.append(' ');
-      	sb.append(text);
-      }
-    }
-    boolean abort = false;
-    NodeList children = node.getChildNodes();
-    if (children != null) {
-      int len = children.getLength();
-      for (int i = 0; i < len; i++) {
-        if (getTextHelper(sb, children.item(i), 
-                          abortOnNestedAnchors, anchorDepth)) {
-          abort = true;
-          break;
-        }
-      }
-    }
-    return abort;
-  }
-
-  /**
-   * This method takes a {@link StringBuffer} and a DOM {@link Node},
-   * and will append the content text found beneath the first
-   * <code>title</code> node to the <code>StringBuffer</code>.
-   *
-   * @return true if a title node was found, false otherwise
-   */
-  public boolean getTitle(StringBuffer sb, Node node) {
-    if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD
-      return false;
-
-    if (node.getNodeType() == Node.ELEMENT_NODE) {
-      if ("title".equalsIgnoreCase(node.getNodeName())) {
-        getText(sb, node);
-        return true;
-      }
-    }
-    NodeList children = node.getChildNodes();
-    if (children != null) {
-      int len = children.getLength();
-      for (int i = 0; i < len; i++) {
-        if (getTitle(sb, children.item(i))) {
-          return true;
-        }
-      }
-    }
-    return false;
-  }
-
-  /** If Node contains a BASE tag then it's HREF is returned. */
-  public URL getBase(Node node) {
-
-    // is this node a BASE tag?
-    if (node.getNodeType() == Node.ELEMENT_NODE) {
-
-      if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD
-        return null;
-
-
-      if ("base".equalsIgnoreCase(node.getNodeName())) {
-        NamedNodeMap attrs = node.getAttributes();
-        for (int i= 0; i < attrs.getLength(); i++ ) {
-          Node attr = attrs.item(i);
-          if ("href".equalsIgnoreCase(attr.getNodeName())) {
-            try {
-              return new URL(attr.getNodeValue());
-            } catch (MalformedURLException e) {}
-          }
-        }
-      }
-    }
-    
-    // does it contain a base tag?
-    NodeList children = node.getChildNodes();
-    if (children != null) {
-      int len = children.getLength();
-      for (int i = 0; i < len; i++) {
-        URL base = getBase(children.item(i));
-        if (base != null)
-          return base;
-      }
-    }
-
-    // no.
-    return null;
-  }
-
-
-  private boolean hasOnlyWhiteSpace(Node node) {
-    String val= node.getNodeValue();
-    for (int i= 0; i < val.length(); i++) {
-      if (!Character.isWhitespace(val.charAt(i)))
-        return false;
-    }
-    return true;
-  }
-
-  // this only covers a few cases of empty links that are symptomatic
-  // of nekohtml's DOM-fixup process...
-  private boolean shouldThrowAwayLink(Node node, NodeList children, 
-                                              int childLen, LinkParams params) {
-    if (childLen == 0) {
-      // this has no inner structure 
-      if (params.childLen == 0) return false;
-      else return true;
-    } else if ((childLen == 1) 
-               && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
-               && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { 
-      // single nested link
-      return true;
-
-    } else if (childLen == 2) {
-
-      Node c0= children.item(0);
-      Node c1= children.item(1);
-
-      if ((c0.getNodeType() == Node.ELEMENT_NODE)
-          && (params.elName.equalsIgnoreCase(c0.getNodeName()))
-          && (c1.getNodeType() == Node.TEXT_NODE) 
-          && hasOnlyWhiteSpace(c1) ) {
-        // single link followed by whitespace node
-        return true;
-      }
-
-      if ((c1.getNodeType() == Node.ELEMENT_NODE)
-          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
-          && (c0.getNodeType() == Node.TEXT_NODE) 
-          && hasOnlyWhiteSpace(c0) ) {
-        // whitespace node followed by single link
-        return true;
-      }
-
-    } else if (childLen == 3) {
-      Node c0= children.item(0);
-      Node c1= children.item(1);
-      Node c2= children.item(2);
-      
-      if ((c1.getNodeType() == Node.ELEMENT_NODE)
-          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
-          && (c0.getNodeType() == Node.TEXT_NODE) 
-          && (c2.getNodeType() == Node.TEXT_NODE) 
-          && hasOnlyWhiteSpace(c0)
-          && hasOnlyWhiteSpace(c2) ) {
-        // single link surrounded by whitespace nodes
-        return true;
-      }
-    }
-
-    return false;
-  }
-
-  /**
-   * This method finds all anchors below the supplied DOM
-   * <code>node</code>, and creates appropriate {@link Outlink}
-   * records for each (relative to the supplied <code>base</code>
-   * URL), and adds them to the <code>outlinks</code> {@link
-   * ArrayList}.
-   *
-   * <p>
-   *
-   * Links without inner structure (tags, text, etc) are discarded, as
-   * are links which contain only single nested links and empty text
-   * nodes (this is a common DOM-fixup artifact, at least with
-   * nekohtml).
-   */
-  public void getOutlinks(URL base, ArrayList outlinks, 
-                                       Node node) {
-
-    NodeList children = node.getChildNodes();
-    int childLen= 0;
-    if (children != null)
-      childLen= children.getLength();
-  
-    if (node.getNodeType() == Node.ELEMENT_NODE) {
-      String nodeName = node.getNodeName().toLowerCase();
-      LinkParams params = (LinkParams)linkParams.get(nodeName);
-      if (params != null) {
-        if (!shouldThrowAwayLink(node, children, childLen, params)) {
-
-          StringBuffer linkText = new StringBuffer();
-          getText(linkText, node, true);
-
-          NamedNodeMap attrs = node.getAttributes();
-          String target = null;
-          boolean noFollow = false;
-          boolean post = false;
-          for (int i= 0; i < attrs.getLength(); i++ ) {
-            Node attr = attrs.item(i);
-            String attrName = attr.getNodeName();
-            if (params.attrName.equalsIgnoreCase(attrName)) {
-              target = attr.getNodeValue();
-            } else if ("rel".equalsIgnoreCase(attrName) &&
-                       "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
-              noFollow = true;
-            } else if ("method".equalsIgnoreCase(attrName) &&
-                       "post".equalsIgnoreCase(attr.getNodeValue())) {
-              post = true;
-            }
-          }
-          if (target != null && !noFollow && !post)
-            try {
-              URL url = new URL(base, target);
-              outlinks.add(new Outlink(url.toString(),
-                                       linkText.toString().trim(), conf));
-            } catch (MalformedURLException e) {
-              // don't care
-            }
-        }
-        // this should not have any children, skip them
-        if (params.childLen == 0) return;
-      }
-    }
-    for ( int i = 0; i < childLen; i++ ) {
-      getOutlinks(base, outlinks, children.item(i));
-    }
-  }
-
-}
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.hadoop.conf.Configuration;
+
+import org.w3c.dom.*;
+
+/**
+ * A collection of methods for extracting content from DOM trees.
+ * 
+ * This class holds a few utility methods for pulling content out of 
+ * DOM nodes, such as getOutlinks, getText, etc.
+ *
+ */
+public class DOMContentUtils {
+
+  public static class LinkParams {
+    public String elName;
+    public String attrName;
+      public int childLen;
+      
+      public LinkParams(String elName, String attrName, int childLen) {
+          this.elName = elName;
+          this.attrName = attrName;
+          this.childLen = childLen;
+      }
+      
+      public String toString() {
+          return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
+      }
+  }
+  
+  private HashMap linkParams = new HashMap();
+  private Configuration conf;
+  
+  
+  public DOMContentUtils(Configuration conf) {
+    setConf(conf);
+  }
+  
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    linkParams.clear();
+    linkParams.put("a", new LinkParams("a", "href", 1));
+    linkParams.put("area", new LinkParams("area", "href", 0));
+    if (conf.getBoolean("parser.html.form.use_action", false)) {
+      linkParams.put("form", new LinkParams("form", "action", 1));
+    }
+    linkParams.put("frame", new LinkParams("frame", "src", 0));
+    linkParams.put("iframe", new LinkParams("iframe", "src", 0));
+    linkParams.put("script", new LinkParams("script", "src", 0));
+    linkParams.put("link", new LinkParams("link", "href", 0));
+    linkParams.put("img", new LinkParams("img", "src", 0));
+  }
+  
+  /**
+   * This method takes a {@link StringBuffer} and a DOM {@link Node},
+   * and will append all the content text found beneath the DOM node to 
+   * the <code>StringBuffer</code>.
+   *
+   * <p>
+   *
+   * If <code>abortOnNestedAnchors</code> is true, DOM traversal will
+   * be aborted and the <code>StringBuffer</code> will not contain
+   * any text encountered after a nested anchor is found.
+   * 
+   * <p>
+   *
+   * @return true if nested anchors were found
+   */
+  public boolean getText(StringBuffer sb, Node node, 
+                                      boolean abortOnNestedAnchors) {
+    if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
+      return true;
+    } 
+    return false;
+  }
+
+
+  /**
+   * This is a convenience method, equivalent to {@link
+   * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
+   * 
+   */
+  public void getText(StringBuffer sb, Node node) {
+    getText(sb, node, false);
+  }
+
+  // returns true if abortOnNestedAnchors is true and we find nested 
+  // anchors
+  private boolean getTextHelper(StringBuffer sb, Node node, 
+                                             boolean abortOnNestedAnchors,
+                                             int anchorDepth) {
+    if ("script".equalsIgnoreCase(node.getNodeName())) {
+      return false;
+    }
+    if ("style".equalsIgnoreCase(node.getNodeName())) {
+      return false;
+    }
+    if (abortOnNestedAnchors && "a".equalsIgnoreCase(node.getNodeName())) {
+      anchorDepth++;
+      if (anchorDepth > 1)
+        return true;
+    }
+    if (node.getNodeType() == Node.COMMENT_NODE) {
+      return false;
+    }
+    if (node.getNodeType() == Node.TEXT_NODE) {
+      // cleanup and trim the value
+      String text = node.getNodeValue();
+      text = text.replaceAll("\\s+", " ");
+      text = text.trim();
+      if (text.length() > 0) {
+        if (sb.length() > 0) sb.append(' ');
+      	sb.append(text);
+      }
+    }
+    boolean abort = false;
+    NodeList children = node.getChildNodes();
+    if (children != null) {
+      int len = children.getLength();
+      for (int i = 0; i < len; i++) {
+        if (getTextHelper(sb, children.item(i), 
+                          abortOnNestedAnchors, anchorDepth)) {
+          abort = true;
+          break;
+        }
+      }
+    }
+    return abort;
+  }
+
+  /**
+   * This method takes a {@link StringBuffer} and a DOM {@link Node},
+   * and will append the content text found beneath the first
+   * <code>title</code> node to the <code>StringBuffer</code>.
+   *
+   * @return true if a title node was found, false otherwise
+   */
+  public boolean getTitle(StringBuffer sb, Node node) {
+    if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD
+      return false;
+
+    if (node.getNodeType() == Node.ELEMENT_NODE) {
+      if ("title".equalsIgnoreCase(node.getNodeName())) {
+        getText(sb, node);
+        return true;
+      }
+    }
+    NodeList children = node.getChildNodes();
+    if (children != null) {
+      int len = children.getLength();
+      for (int i = 0; i < len; i++) {
+        if (getTitle(sb, children.item(i))) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  /** If Node contains a BASE tag then its HREF is returned. */
+  public URL getBase(Node node) {
+
+    // is this node a BASE tag?
+    if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+      if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD
+        return null;
+
+
+      if ("base".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        for (int i= 0; i < attrs.getLength(); i++ ) {
+          Node attr = attrs.item(i);
+          if ("href".equalsIgnoreCase(attr.getNodeName())) {
+            try {
+              return new URL(attr.getNodeValue());
+            } catch (MalformedURLException e) {}
+          }
+        }
+      }
+    }
+    
+    // does it contain a base tag?
+    NodeList children = node.getChildNodes();
+    if (children != null) {
+      int len = children.getLength();
+      for (int i = 0; i < len; i++) {
+        URL base = getBase(children.item(i));
+        if (base != null)
+          return base;
+      }
+    }
+
+    // no.
+    return null;
+  }
+
+
+  private boolean hasOnlyWhiteSpace(Node node) {
+    String val= node.getNodeValue();
+    for (int i= 0; i < val.length(); i++) {
+      if (!Character.isWhitespace(val.charAt(i)))
+        return false;
+    }
+    return true;
+  }
+
+  // this only covers a few cases of empty links that are symptomatic
+  // of nekohtml's DOM-fixup process...
+  private boolean shouldThrowAwayLink(Node node, NodeList children, 
+                                              int childLen, LinkParams params) {
+    if (childLen == 0) {
+      // this has no inner structure 
+      if (params.childLen == 0) return false;
+      else return true;
+    } else if ((childLen == 1) 
+               && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
+               && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { 
+      // single nested link
+      return true;
+
+    } else if (childLen == 2) {
+
+      Node c0= children.item(0);
+      Node c1= children.item(1);
+
+      if ((c0.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c0.getNodeName()))
+          && (c1.getNodeType() == Node.TEXT_NODE) 
+          && hasOnlyWhiteSpace(c1) ) {
+        // single link followed by whitespace node
+        return true;
+      }
+
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE) 
+          && hasOnlyWhiteSpace(c0) ) {
+        // whitespace node followed by single link
+        return true;
+      }
+
+    } else if (childLen == 3) {
+      Node c0= children.item(0);
+      Node c1= children.item(1);
+      Node c2= children.item(2);
+      
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE) 
+          && (c2.getNodeType() == Node.TEXT_NODE) 
+          && hasOnlyWhiteSpace(c0)
+          && hasOnlyWhiteSpace(c2) ) {
+        // single link surrounded by whitespace nodes
+        return true;
+      }
+    }
+
+    return false;
+  }
+  
+  /**
+   * Handles cases where the url param information is encoded into the base
+   * url as opposed to the target.
+   * <p>
+   * If the target contains params (i.e. ';xxxx') information then the target
+   * params information is assumed to be correct and any base params information
+   * is ignored.  If the base contains params information but the target does
+   * not, then the base params information is appended to the target path
+   * (before any query string) and the result is resolved by java.net.URL.
+   * 
+   * @param base The base URL.
+   * @param target The target path from the base URL.
+   * 
+   * @return URL A URL with the params information correctly encoded.
+   * 
+   * @throws MalformedURLException If the url is not a well formed URL.
+   */
+  private URL fixEmbeddedParams(URL base, String target) 
+    throws MalformedURLException{
+    
+    // if the target contains params information, or the base doesn't, then
+    // no conversion is necessary; return the regular URL
+    if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
+      return new URL(base, target);
+    }
+    
+    // get the base url and its params information
+    String baseURL = base.toString();
+    int startParams = baseURL.indexOf(';');
+    String params = baseURL.substring(startParams);
+    
+    // if the target has a query string then put the params information after
+    // any path but before the query string, otherwise just append to the path
+    int startQS = target.indexOf('?');
+    if (startQS >= 0) {
+      target = target.substring(0, startQS) + params + 
+        target.substring(startQS);
+    }
+    else {
+      target += params;
+    }
+    
+    return new URL(base, target);
+  }
+
+  /**
+   * This method finds all anchors below the supplied DOM
+   * <code>node</code>, and creates appropriate {@link Outlink}
+   * records for each (relative to the supplied <code>base</code>
+   * URL), and adds them to the <code>outlinks</code> {@link
+   * ArrayList}.
+   *
+   * <p>
+   *
+   * Links without inner structure (tags, text, etc) are discarded, as
+   * are links which contain only single nested links and empty text
+   * nodes (this is a common DOM-fixup artifact, at least with
+   * nekohtml).
+   */
+  public void getOutlinks(URL base, ArrayList outlinks, 
+                                       Node node) {
+
+    NodeList children = node.getChildNodes();
+    int childLen= 0;
+    if (children != null)
+      childLen= children.getLength();
+  
+    if (node.getNodeType() == Node.ELEMENT_NODE) {
+      String nodeName = node.getNodeName().toLowerCase();
+      LinkParams params = (LinkParams)linkParams.get(nodeName);
+      if (params != null) {
+        if (!shouldThrowAwayLink(node, children, childLen, params)) {
+
+          StringBuffer linkText = new StringBuffer();
+          getText(linkText, node, true);
+
+          NamedNodeMap attrs = node.getAttributes();
+          String target = null;
+          boolean noFollow = false;
+          boolean post = false;
+          for (int i= 0; i < attrs.getLength(); i++ ) {
+            Node attr = attrs.item(i);
+            String attrName = attr.getNodeName();
+            if (params.attrName.equalsIgnoreCase(attrName)) {
+              target = attr.getNodeValue();
+            } else if ("rel".equalsIgnoreCase(attrName) &&
+                       "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+              noFollow = true;
+            } else if ("method".equalsIgnoreCase(attrName) &&
+                       "post".equalsIgnoreCase(attr.getNodeValue())) {
+              post = true;
+            }
+          }
+          if (target != null && !noFollow && !post)
+            try {
+              
+              URL url = (base.toString().indexOf(';') > 0) ? 
+                fixEmbeddedParams(base, target) :  new URL(base, target);
+              outlinks.add(new Outlink(url.toString(),
+                                       linkText.toString().trim(), conf));
+            } catch (MalformedURLException e) {
+              // don't care
+            }
+        }
+        // this should not have any children, skip them
+        if (params.childLen == 0) return;
+      }
+    }
+    for ( int i = 0; i < childLen; i++ ) {
+      getOutlinks(base, outlinks, children.item(i));
+    }
+  }
+
+}
+

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?view=diff&rev=516648&r1=516647&r2=516648
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Mar  9 19:55:23 2007
@@ -1,376 +1,408 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import junit.framework.TestCase;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-import java.io.ByteArrayInputStream;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.StringTokenizer;
-
-import org.cyberneko.html.parsers.*;
-import org.xml.sax.*;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-
-/** 
- * Unit tests for DOMContentUtils.
- */
-public class TestDOMContentUtils extends TestCase {
-
-  private static final String[] testPages= { 
-    new String("<html><head><title> title </title><script> script </script>"
-               + "</head><body> body <a href=\"http://www.nutch.org\">"
-               + " anchor </a><!--comment-->"
-               + "</body></html>"),
-    new String("<html><head><title> title </title><script> script </script>"
-               + "</head><body> body <a href=\"/\">"
-               + " home </a><!--comment-->"
-               + "<style> style </style>"
-               + " <a href=\"bot.html\">"
-               + " bots </a>"
-               + "</body></html>"),
-    new String("<html><head><title> </title>"
-               + "</head><body> "
-               + "<a href=\"/\"> separate this "
-               + "<a href=\"ok\"> from this"
-               + "</a></a>"
-               + "</body></html>"),
-    // this one relies on certain neko fixup behavior, possibly
-    // distributing the anchors into the LI's-but not the other
-    // anchors (outside of them, instead)!  So you get a tree that
-    // looks like:
-    // ... <li> <a href=/> home </a> </li>
-    //     <li> <a href=/> <a href="1"> 1 </a> </a> </li>
-    //     <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
-    new String("<html><head><title> my title </title>"
-               + "</head><body> body "
-               + "<ul>"
-               + "<li> <a href=\"/\"> home"
-               + "<li> <a href=\"1\"> 1"
-               + "<li> <a href=\"2\"> 2"
-               + "</ul>"
-               + "</body></html>"),
-    // test frameset link extraction. The invalid frame in the middle will be
-    // fixed to a third standalone frame.
-    new String("<html><head><title> my title </title>"
-               + "</head><frameset rows=\"20,*\"> "
-               + "<frame src=\"top.html\">"
-               + "</frame>"
-               + "<frameset cols=\"20,*\">"
-               + "<frame src=\"left.html\">"
-               + "<frame src=\"invalid.html\"/>"
-               + "</frame>"
-               + "<frame src=\"right.html\">"
-               + "</frame>"
-               + "</frameset>"
-               + "</frameset>"
-               + "</body></html>"),
-    // test <area> and <iframe> link extraction + url normalization
-    new String("<html><head><title> my title </title>"
-               + "</head><body>"
-               + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
-			   + "<map name=\"green\">"
-			   + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
-			   + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
-			   + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
-			   + "</map>"
-               + "<a name=\"bottom\"/><h1> the bottom </h1> "
-               + "<iframe src=\"../docs/index.html\"/>"
-               + "</body></html>"),
-    // test whitespace processing for plain text extraction
-    new String("<html><head>\n <title> my\t\n  title\r\n </title>\n"
-               + " </head>\n"
-               + " <body>\n"
-               + "    <h1> Whitespace\ttest  </h1> \n"
-               + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
-               + "    <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
-               + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
-               + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
-               + "<table>"
-               + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
-               + "    <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
-               + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
-               + "</table>put some text here<Br>and there."
-               + "<h2>End\tthis\rmadness\n!</h2>\r\n"
-               + "         .        .        .         ."
-               + "</body>  </html>"),
-
-    // test that <a rel=nofollow> links are not returned
-    new String("<html><head></head><body>"
-               + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
-               + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
-               + "</body></html>"),
-    // test that POST form actions are skipped
-    new String("<html><head></head><body>"
-            + "<form method='POST' action='/search.jsp'><input type=text>"
-            + "<input type=submit><p>test1</p></form>"
-            + "<form method='GET' action='/dummy.jsp'><input type=text>"
-            + "<input type=submit><p>test2</p></form></body></html>"),
-    // test that all form actions are skipped
-    new String("<html><head></head><body>"
-            + "<form method='POST' action='/search.jsp'><input type=text>"
-            + "<input type=submit><p>test1</p></form>"
-            + "<form method='GET' action='/dummy.jsp'><input type=text>"
-            + "<input type=submit><p>test2</p></form></body></html>"),
-  };
-  
-  private static int SKIP = 9;
-
-  private static String[] testBaseHrefs= {
-    "http://www.nutch.org",     
-    "http://www.nutch.org/docs/foo.html",     
-    "http://www.nutch.org/docs/",     
-    "http://www.nutch.org/docs/",
-    "http://www.nutch.org/frames/",     
-    "http://www.nutch.org/maps/",
-    "http://www.nutch.org/whitespace/",
-    "http://www.nutch.org//",
-    "http://www.nutch.org/",
-    "http://www.nutch.org/",
-  };
-    
-  private static final DocumentFragment testDOMs[]=
-    new DocumentFragment[testPages.length];
-
-  private static URL[] testBaseHrefURLs= 
-    new URL[testPages.length];
-
-
-  private static final String[] answerText= {
-    "title body anchor",
-    "title body home bots",
-    "separate this from this",
-    "my title body home 1 2",
-    "my title",
-    "my title the bottom",
-    "my title Whitespace test whitespace test "
-        + "This is a whitespace test . Newlines should appear as space too. "
-        + "Tabs are spaces too. This is a break -> and the line after break . "
-        + "one two three space here space there no space "
-        + "one two two three three four put some text here and there. "
-        + "End this madness ! . . . .",
-    "ignore ignore",
-    "test1 test2",
-    "test1 test2"
-  };
-
-  private static final String[] answerTitle= {
-    "title",
-    "title",
-    "",
-    "my title",
-    "my title",
-    "my title",
-    "my title",
-    "",
-    "",
-    ""
-  };
-
-  // note: should be in page-order
-  private static Outlink[][] answerOutlinks;
-  
-  private static Configuration conf;
-  private static DOMContentUtils utils = null;
-  
-  public TestDOMContentUtils(String name) { 
-    super(name); 
-  }
-
-  private static void setup() {
-    conf = NutchConfiguration.create();
-    conf.setBoolean("parser.html.form.use_action", true);
-    utils = new DOMContentUtils(conf);
-    DOMFragmentParser parser= new DOMFragmentParser();
-    for (int i= 0; i < testPages.length; i++) {
-        DocumentFragment node= 
-          new HTMLDocumentImpl().createDocumentFragment();
-        try {
-          parser.parse(
-            new InputSource( 
-              new ByteArrayInputStream(testPages[i].getBytes()) ),
-            node);
-          testBaseHrefURLs[i]= new URL(testBaseHrefs[i]);
-        } catch (Exception e) {
-          assertTrue("caught exception: " + e, false);
-        } 
-      testDOMs[i]= node;
-    }
-    try {
-     answerOutlinks = new Outlink[][]{ 
-         {
-           new Outlink("http://www.nutch.org", "anchor", conf),
-         },
-         {
-           new Outlink("http://www.nutch.org/", "home", conf),
-           new Outlink("http://www.nutch.org/docs/bot.html", "bots", conf),
-         },
-         {
-           new Outlink("http://www.nutch.org/", "separate this", conf),
-           new Outlink("http://www.nutch.org/docs/ok", "from this", conf),
-         },
-         {
-           new Outlink("http://www.nutch.org/", "home", conf),
-           new Outlink("http://www.nutch.org/docs/1", "1", conf),
-           new Outlink("http://www.nutch.org/docs/2", "2", conf),
-         },
-         {
-           new Outlink("http://www.nutch.org/frames/top.html", "", conf),
-           new Outlink("http://www.nutch.org/frames/left.html", "", conf),
-           new Outlink("http://www.nutch.org/frames/invalid.html", "", conf),
-           new Outlink("http://www.nutch.org/frames/right.html", "", conf),
-         },
-         {
-           new Outlink("http://www.nutch.org/maps/logo.gif", "", conf),
-           new Outlink("http://www.nutch.org/index.html", "", conf),
-           new Outlink("http://www.nutch.org/maps/#bottom", "", conf),
-           new Outlink("http://www.nutch.org/bot.html", "", conf),
-           new Outlink("http://www.nutch.org/docs/index.html", "", conf),
-         },
-         {
-             new Outlink("http://www.nutch.org/index.html", "whitespace test", conf),
-         },
-         {
-         },
-         {
-           new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf),
-         },
-         {
-         }
-      };
-   
-    } catch (MalformedURLException e) {
-        
-    }
-  }
-
-  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
-    StringTokenizer st1= new StringTokenizer(s1);
-    StringTokenizer st2= new StringTokenizer(s2);
-
-    while (st1.hasMoreTokens()) {
-      if (!st2.hasMoreTokens()) 
-        return false;
-      if ( ! st1.nextToken().equals(st2.nextToken()) )
-        return false;
-    }
-    if (st2.hasMoreTokens()) 
-      return false;
-    return true;
-  }
-
-  public void testGetText() {
-    if (testDOMs[0] == null) 
-      setup();
-    for (int i= 0; i < testPages.length; i++) {
-      StringBuffer sb= new StringBuffer();
-      utils.getText(sb, testDOMs[i]);
-      String text= sb.toString();
-      assertTrue("expecting text: " + answerText[i] 
-                 + System.getProperty("line.separator") 
-                 + System.getProperty("line.separator") 
-                 + "got text: "+ text, 
-                 equalsIgnoreWhitespace(answerText[i], text));
-    }
-  }
-
-  public void testGetTitle() {
-    if (testDOMs[0] == null) 
-      setup();
-    for (int i= 0; i < testPages.length; i++) {
-      StringBuffer sb= new StringBuffer();
-      utils.getTitle(sb, testDOMs[i]);
-      String text= sb.toString();
-      assertTrue("expecting text: " + answerText[i] 
-                 + System.getProperty("line.separator") 
-                 + System.getProperty("line.separator") 
-                 + "got text: "+ text, 
-                 equalsIgnoreWhitespace(answerTitle[i], text));
-    }
-  }
-
-  public void testGetOutlinks() {
-    if (testDOMs[0] == null) 
-      setup();
-    for (int i= 0; i < testPages.length; i++) {
-      ArrayList outlinks= new ArrayList();
-      if (i == SKIP) {
-        conf.setBoolean("parser.html.form.use_action", false);
-        utils.setConf(conf);
-      } else {
-        conf.setBoolean("parser.html.form.use_action", true);
-        utils.setConf(conf);
-      }
-      utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
-      Outlink[] outlinkArr= new Outlink[outlinks.size()];
-      outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr);
-      compareOutlinks(answerOutlinks[i], outlinkArr);
-    }
-  }
-
-  private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
-    for (int i= 0; i < o.length; i++) {
-      sb.append(o[i].toString());
-      sb.append(System.getProperty("line.separator"));
-    }
-  }
-
-  private static final String outlinksString(Outlink[] o) {
-    StringBuffer sb= new StringBuffer();
-    appendOutlinks(sb, o);
-    return sb.toString();
-  }
-
-  private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
-    if (o1.length != o2.length) {
-      assertTrue("got wrong number of outlinks (expecting " + o1.length 
-                 + ", got " + o2.length + ")" 
-                 + System.getProperty("line.separator") 
-                 + "answer: " + System.getProperty("line.separator") 
-                 + outlinksString(o1) 
-                 + System.getProperty("line.separator") 
-                 + "got: " + System.getProperty("line.separator") 
-                 + outlinksString(o2)
-                 + System.getProperty("line.separator"),
-                 false
-        );
-    }
-
-    for (int i= 0; i < o1.length; i++) {
-      if (!o1[i].equals(o2[i])) {
-        assertTrue("got wrong outlinks at position " + i
-                   + System.getProperty("line.separator") 
-                   + "answer: " + System.getProperty("line.separator") 
-                   + o1[i].toString()
-                   + System.getProperty("line.separator") 
-                   + "got: " + System.getProperty("line.separator") 
-                   + o2[i].toString(),
-                   false
-          );
-        
-      }
-    }
-  }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import junit.framework.TestCase;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.ByteArrayInputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+
+import org.cyberneko.html.parsers.*;
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+
+/** 
+ * Unit tests for DOMContentUtils.
+ */
+public class TestDOMContentUtils extends TestCase {
+
+  private static final String[] testPages= { 
+    new String("<html><head><title> title </title><script> script </script>"
+               + "</head><body> body <a href=\"http://www.nutch.org\">"
+               + " anchor </a><!--comment-->"
+               + "</body></html>"),
+    new String("<html><head><title> title </title><script> script </script>"
+               + "</head><body> body <a href=\"/\">"
+               + " home </a><!--comment-->"
+               + "<style> style </style>"
+               + " <a href=\"bot.html\">"
+               + " bots </a>"
+               + "</body></html>"),
+    new String("<html><head><title> </title>"
+               + "</head><body> "
+               + "<a href=\"/\"> separate this "
+               + "<a href=\"ok\"> from this"
+               + "</a></a>"
+               + "</body></html>"),
+    // this one relies on certain neko fixup behavior, possibly
+    // distributing the anchors into the LI's-but not the other
+    // anchors (outside of them, instead)!  So you get a tree that
+    // looks like:
+    // ... <li> <a href=/> home </a> </li>
+    //     <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+    //     <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+    new String("<html><head><title> my title </title>"
+               + "</head><body> body "
+               + "<ul>"
+               + "<li> <a href=\"/\"> home"
+               + "<li> <a href=\"1\"> 1"
+               + "<li> <a href=\"2\"> 2"
+               + "</ul>"
+               + "</body></html>"),
+    // test frameset link extraction. The invalid frame in the middle will be
+    // fixed to a third standalone frame.
+    new String("<html><head><title> my title </title>"
+               + "</head><frameset rows=\"20,*\"> "
+               + "<frame src=\"top.html\">"
+               + "</frame>"
+               + "<frameset cols=\"20,*\">"
+               + "<frame src=\"left.html\">"
+               + "<frame src=\"invalid.html\"/>"
+               + "</frame>"
+               + "<frame src=\"right.html\">"
+               + "</frame>"
+               + "</frameset>"
+               + "</frameset>"
+               + "</body></html>"),
+    // test <area> and <iframe> link extraction + url normalization
+    new String("<html><head><title> my title </title>"
+               + "</head><body>"
+               + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+			   + "<map name=\"green\">"
+			   + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+			   + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+			   + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+			   + "</map>"
+               + "<a name=\"bottom\"/><h1> the bottom </h1> "
+               + "<iframe src=\"../docs/index.html\"/>"
+               + "</body></html>"),
+    // test whitespace processing for plain text extraction
+    new String("<html><head>\n <title> my\t\n  title\r\n </title>\n"
+               + " </head>\n"
+               + " <body>\n"
+               + "    <h1> Whitespace\ttest  </h1> \n"
+               + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
+               + "    <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+               + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+               + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
+               + "<table>"
+               + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+               + "    <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+               + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+               + "</table>put some text here<Br>and there."
+               + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+               + "         .        .        .         ."
+               + "</body>  </html>"),
+
+    // test that <a rel=nofollow> links are not returned
+    new String("<html><head></head><body>"
+               + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+               + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+               + "</body></html>"),
+    // test that POST form actions are skipped
+    new String("<html><head></head><body>"
+            + "<form method='POST' action='/search.jsp'><input type=text>"
+            + "<input type=submit><p>test1</p></form>"
+            + "<form method='GET' action='/dummy.jsp'><input type=text>"
+            + "<input type=submit><p>test2</p></form></body></html>"),
+    // test that all form actions are skipped
+    new String("<html><head></head><body>"
+            + "<form method='POST' action='/search.jsp'><input type=text>"
+            + "<input type=submit><p>test1</p></form>"
+            + "<form method='GET' action='/dummy.jsp'><input type=text>"
+            + "<input type=submit><p>test2</p></form></body></html>"),
+    new String("<html><head><title> title </title>"
+      + "</head><body>"
+      + "<a href=\";x\">anchor1</a>"
+      + "<a href=\"g;x\">anchor2</a>"
+      + "<a href=\"g;x?y#s\">anchor3</a>"
+      + "</body></html>"),  
+    new String("<html><head><title> title </title>"
+        + "</head><body>"
+        + "<a href=\"g\">anchor1</a>"
+        + "<a href=\"g?y#s\">anchor2</a>"
+        + "<a href=\"?y=1\">anchor3</a>"
+        + "<a href=\"?y=1#s\">anchor4</a>"
+        + "<a href=\"?y=1;somethingelse\">anchor5</a>"
+        + "</body></html>"), 
+  };
+  
+  private static int SKIP = 9;
+
+  private static String[] testBaseHrefs= {
+    "http://www.nutch.org",     
+    "http://www.nutch.org/docs/foo.html",     
+    "http://www.nutch.org/docs/",     
+    "http://www.nutch.org/docs/",
+    "http://www.nutch.org/frames/",     
+    "http://www.nutch.org/maps/",
+    "http://www.nutch.org/whitespace/",
+    "http://www.nutch.org//",
+    "http://www.nutch.org/",
+    "http://www.nutch.org/",
+    "http://www.nutch.org/",
+    "http://www.nutch.org/;something"
+  };
+    
+  private static final DocumentFragment testDOMs[]=
+    new DocumentFragment[testPages.length];
+
+  private static URL[] testBaseHrefURLs= 
+    new URL[testPages.length];
+
+
+  private static final String[] answerText= {
+    "title body anchor",
+    "title body home bots",
+    "separate this from this",
+    "my title body home 1 2",
+    "my title",
+    "my title the bottom",
+    "my title Whitespace test whitespace test "
+        + "This is a whitespace test . Newlines should appear as space too. "
+        + "Tabs are spaces too. This is a break -> and the line after break . "
+        + "one two three space here space there no space "
+        + "one two two three three four put some text here and there. "
+        + "End this madness ! . . . .",
+    "ignore ignore",
+    "test1 test2",
+    "test1 test2",
+    "title anchor1 anchor2 anchor3",
+    "title anchor1 anchor2 anchor3 anchor4 anchor5"
+  };
+
+  private static final String[] answerTitle= {
+    "title",
+    "title",
+    "",
+    "my title",
+    "my title",
+    "my title",
+    "my title",
+    "",
+    "",
+    "",
+    "title",
+    "title"
+  };
+
+  // note: should be in page-order
+  private static Outlink[][] answerOutlinks;
+  
+  private static Configuration conf;
+  private static DOMContentUtils utils = null;
+  
+  public TestDOMContentUtils(String name) { 
+    super(name); 
+  }
+
+  /** Parses every test page into a DOM and builds the expected outlinks. */
+  private static void setup() {
+    conf = NutchConfiguration.create();
+    conf.setBoolean("parser.html.form.use_action", true);
+    utils = new DOMContentUtils(conf);
+    DOMFragmentParser parser= new DOMFragmentParser();
+    for (int i= 0; i < testPages.length; i++) {
+        DocumentFragment node= new HTMLDocumentImpl().createDocumentFragment();
+        try {
+          parser.parse(
+            new InputSource( 
+              new ByteArrayInputStream(testPages[i].getBytes()) ),
+            node);
+          testBaseHrefURLs[i]= new URL(testBaseHrefs[i]);
+        } catch (Exception e) {
+          assertTrue("caught exception: " + e, false);
+        } 
+      testDOMs[i]= node;
+    }
+    try {
+     answerOutlinks = new Outlink[][]{ // one row per test page, links in page order
+         {
+           new Outlink("http://www.nutch.org", "anchor", conf),
+         },
+         {
+           new Outlink("http://www.nutch.org/", "home", conf),
+           new Outlink("http://www.nutch.org/docs/bot.html", "bots", conf),
+         },
+         {
+           new Outlink("http://www.nutch.org/", "separate this", conf),
+           new Outlink("http://www.nutch.org/docs/ok", "from this", conf),
+         },
+         {
+           new Outlink("http://www.nutch.org/", "home", conf),
+           new Outlink("http://www.nutch.org/docs/1", "1", conf),
+           new Outlink("http://www.nutch.org/docs/2", "2", conf),
+         },
+         {
+           new Outlink("http://www.nutch.org/frames/top.html", "", conf),
+           new Outlink("http://www.nutch.org/frames/left.html", "", conf),
+           new Outlink("http://www.nutch.org/frames/invalid.html", "", conf),
+           new Outlink("http://www.nutch.org/frames/right.html", "", conf),
+         },
+         {
+           new Outlink("http://www.nutch.org/maps/logo.gif", "", conf),
+           new Outlink("http://www.nutch.org/index.html", "", conf),
+           new Outlink("http://www.nutch.org/maps/#bottom", "", conf),
+           new Outlink("http://www.nutch.org/bot.html", "", conf),
+           new Outlink("http://www.nutch.org/docs/index.html", "", conf),
+         },
+         {
+             new Outlink("http://www.nutch.org/index.html", "whitespace test", conf),
+         },
+         {
+         },
+         {
+           new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf),
+         },
+         {
+         },
+         {
+           new Outlink("http://www.nutch.org/;x", "anchor1", conf),
+           new Outlink("http://www.nutch.org/g;x", "anchor2", conf),
+           new Outlink("http://www.nutch.org/g;x?y#s", "anchor3", conf)
+         },
+         {
+           new Outlink("http://www.nutch.org/g;something", "anchor1", conf),
+           new Outlink("http://www.nutch.org/g;something?y#s", "anchor2", conf),
+           new Outlink("http://www.nutch.org/;something?y=1", "anchor3", conf),
+           new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4", conf),
+           new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5", conf)
+         }
+      };
+    } catch (MalformedURLException e) {
+      // a malformed expected URL must fail the test, not leave answerOutlinks null
+      assertTrue("caught exception: " + e, false);
+    }
+  }
+
+  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
+    StringTokenizer st1= new StringTokenizer(s1);
+    StringTokenizer st2= new StringTokenizer(s2);
+
+    while (st1.hasMoreTokens()) {
+      if (!st2.hasMoreTokens()) 
+        return false;
+      if ( ! st1.nextToken().equals(st2.nextToken()) )
+        return false;
+    }
+    if (st2.hasMoreTokens()) 
+      return false;
+    return true;
+  }
+
+  public void testGetText() {
+    if (testDOMs[0] == null) 
+      setup();
+    for (int i= 0; i < testPages.length; i++) {
+      StringBuffer sb= new StringBuffer();
+      utils.getText(sb, testDOMs[i]);
+      String text= sb.toString();
+      assertTrue("expecting text: " + answerText[i] 
+                 + System.getProperty("line.separator") 
+                 + System.getProperty("line.separator") 
+                 + "got text: "+ text, 
+                 equalsIgnoreWhitespace(answerText[i], text));
+    }
+  }
+
+  /** Checks that getTitle() yields the expected title for every test page. */
+  public void testGetTitle() {
+    if (testDOMs[0] == null) setup();  // fixtures are built lazily on first use
+    for (int i= 0; i < testPages.length; i++) {
+      StringBuffer sb= new StringBuffer();
+      utils.getTitle(sb, testDOMs[i]);
+      String text= sb.toString();
+      // report the expected title, i.e. the value the comparison below uses
+      assertTrue("expecting text: " + answerTitle[i] 
+                 + System.getProperty("line.separator") + System.getProperty("line.separator") 
+                 + "got text: "+ text, 
+                 equalsIgnoreWhitespace(answerTitle[i], text));
+    }
+  }
+
+  public void testGetOutlinks() {
+    if (testDOMs[0] == null) 
+      setup();
+    for (int i= 0; i < testPages.length; i++) {
+      ArrayList outlinks= new ArrayList();
+      if (i == SKIP) {
+        conf.setBoolean("parser.html.form.use_action", false);
+        utils.setConf(conf);
+      } else {
+        conf.setBoolean("parser.html.form.use_action", true);
+        utils.setConf(conf);
+      }
+      utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
+      Outlink[] outlinkArr= new Outlink[outlinks.size()];
+      outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr);
+      compareOutlinks(answerOutlinks[i], outlinkArr);
+    }
+  }
+
+  private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
+    for (int i= 0; i < o.length; i++) {
+      sb.append(o[i].toString());
+      sb.append(System.getProperty("line.separator"));
+    }
+  }
+
+  private static final String outlinksString(Outlink[] o) {
+    StringBuffer sb= new StringBuffer();
+    appendOutlinks(sb, o);
+    return sb.toString();
+  }
+
+  private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
+    if (o1.length != o2.length) {
+      assertTrue("got wrong number of outlinks (expecting " + o1.length 
+                 + ", got " + o2.length + ")" 
+                 + System.getProperty("line.separator") 
+                 + "answer: " + System.getProperty("line.separator") 
+                 + outlinksString(o1) 
+                 + System.getProperty("line.separator") 
+                 + "got: " + System.getProperty("line.separator") 
+                 + outlinksString(o2)
+                 + System.getProperty("line.separator"),
+                 false
+        );
+    }
+
+    for (int i= 0; i < o1.length; i++) {
+      if (!o1[i].equals(o2[i])) {
+        assertTrue("got wrong outlinks at position " + i
+                   + System.getProperty("line.separator") 
+                   + "answer: " + System.getProperty("line.separator") 
+                   + o1[i].toString()
+                   + System.getProperty("line.separator") 
+                   + "got: " + System.getProperty("line.separator") 
+                   + o2[i].toString(),
+                   false
+          );
+        
+      }
+    }
+  }
+}