You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ku...@apache.org on 2007/03/10 04:55:24 UTC
svn commit: r516648 - in /lucene/nutch/trunk/src/plugin/parse-html/src:
java/org/apache/nutch/parse/html/DOMContentUtils.java
test/org/apache/nutch/parse/html/TestDOMContentUtils.java
Author: kubes
Date: Fri Mar 9 19:55:23 2007
New Revision: 516648
URL: http://svn.apache.org/viewvc?view=rev&rev=516648
Log:
NUTCH-436 resolved. Fixed behavior of urls with param
(i.e. ;xxxx) information. My problem with EOL characters on
commit should be resolved.
Modified:
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?view=diff&rev=516648&r1=516647&r2=516648
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Mar 9 19:55:23 2007
@@ -1,353 +1,400 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import java.net.URL;
-import java.net.MalformedURLException;
-import java.util.ArrayList;
-import java.util.HashMap;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.hadoop.conf.Configuration;
-
-import org.w3c.dom.*;
-
-/**
- * A collection of methods for extracting content from DOM trees.
- *
- * This class holds a few utility methods for pulling content out of
- * DOM nodes, such as getOutlinks, getText, etc.
- *
- */
-public class DOMContentUtils {
-
- public static class LinkParams {
- public String elName;
- public String attrName;
- public int childLen;
-
- public LinkParams(String elName, String attrName, int childLen) {
- this.elName = elName;
- this.attrName = attrName;
- this.childLen = childLen;
- }
-
- public String toString() {
- return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
- }
- }
-
- private HashMap linkParams = new HashMap();
- private Configuration conf;
-
-
- public DOMContentUtils(Configuration conf) {
- setConf(conf);
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- linkParams.clear();
- linkParams.put("a", new LinkParams("a", "href", 1));
- linkParams.put("area", new LinkParams("area", "href", 0));
- if (conf.getBoolean("parser.html.form.use_action", false)) {
- linkParams.put("form", new LinkParams("form", "action", 1));
- }
- linkParams.put("frame", new LinkParams("frame", "src", 0));
- linkParams.put("iframe", new LinkParams("iframe", "src", 0));
- linkParams.put("script", new LinkParams("script", "src", 0));
- linkParams.put("link", new LinkParams("link", "href", 0));
- linkParams.put("img", new LinkParams("img", "src", 0));
- }
-
- /**
- * This method takes a {@link StringBuffer} and a DOM {@link Node},
- * and will append all the content text found beneath the DOM node to
- * the <code>StringBuffer</code>.
- *
- * <p>
- *
- * If <code>abortOnNestedAnchors</code> is true, DOM traversal will
- * be aborted and the <code>StringBuffer</code> will not contain
- * any text encountered after a nested anchor is found.
- *
- * <p>
- *
- * @return true if nested anchors were found
- */
- public boolean getText(StringBuffer sb, Node node,
- boolean abortOnNestedAnchors) {
- if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
- return true;
- }
- return false;
- }
-
-
- /**
- * This is a convinience method, equivalent to {@link
- * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
- *
- */
- public void getText(StringBuffer sb, Node node) {
- getText(sb, node, false);
- }
-
- // returns true if abortOnNestedAnchors is true and we find nested
- // anchors
- private boolean getTextHelper(StringBuffer sb, Node node,
- boolean abortOnNestedAnchors,
- int anchorDepth) {
- if ("script".equalsIgnoreCase(node.getNodeName())) {
- return false;
- }
- if ("style".equalsIgnoreCase(node.getNodeName())) {
- return false;
- }
- if (abortOnNestedAnchors && "a".equalsIgnoreCase(node.getNodeName())) {
- anchorDepth++;
- if (anchorDepth > 1)
- return true;
- }
- if (node.getNodeType() == Node.COMMENT_NODE) {
- return false;
- }
- if (node.getNodeType() == Node.TEXT_NODE) {
- // cleanup and trim the value
- String text = node.getNodeValue();
- text = text.replaceAll("\\s+", " ");
- text = text.trim();
- if (text.length() > 0) {
- if (sb.length() > 0) sb.append(' ');
- sb.append(text);
- }
- }
- boolean abort = false;
- NodeList children = node.getChildNodes();
- if (children != null) {
- int len = children.getLength();
- for (int i = 0; i < len; i++) {
- if (getTextHelper(sb, children.item(i),
- abortOnNestedAnchors, anchorDepth)) {
- abort = true;
- break;
- }
- }
- }
- return abort;
- }
-
- /**
- * This method takes a {@link StringBuffer} and a DOM {@link Node},
- * and will append the content text found beneath the first
- * <code>title</code> node to the <code>StringBuffer</code>.
- *
- * @return true if a title node was found, false otherwise
- */
- public boolean getTitle(StringBuffer sb, Node node) {
- if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD
- return false;
-
- if (node.getNodeType() == Node.ELEMENT_NODE) {
- if ("title".equalsIgnoreCase(node.getNodeName())) {
- getText(sb, node);
- return true;
- }
- }
- NodeList children = node.getChildNodes();
- if (children != null) {
- int len = children.getLength();
- for (int i = 0; i < len; i++) {
- if (getTitle(sb, children.item(i))) {
- return true;
- }
- }
- }
- return false;
- }
-
- /** If Node contains a BASE tag then it's HREF is returned. */
- public URL getBase(Node node) {
-
- // is this node a BASE tag?
- if (node.getNodeType() == Node.ELEMENT_NODE) {
-
- if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD
- return null;
-
-
- if ("base".equalsIgnoreCase(node.getNodeName())) {
- NamedNodeMap attrs = node.getAttributes();
- for (int i= 0; i < attrs.getLength(); i++ ) {
- Node attr = attrs.item(i);
- if ("href".equalsIgnoreCase(attr.getNodeName())) {
- try {
- return new URL(attr.getNodeValue());
- } catch (MalformedURLException e) {}
- }
- }
- }
- }
-
- // does it contain a base tag?
- NodeList children = node.getChildNodes();
- if (children != null) {
- int len = children.getLength();
- for (int i = 0; i < len; i++) {
- URL base = getBase(children.item(i));
- if (base != null)
- return base;
- }
- }
-
- // no.
- return null;
- }
-
-
- private boolean hasOnlyWhiteSpace(Node node) {
- String val= node.getNodeValue();
- for (int i= 0; i < val.length(); i++) {
- if (!Character.isWhitespace(val.charAt(i)))
- return false;
- }
- return true;
- }
-
- // this only covers a few cases of empty links that are symptomatic
- // of nekohtml's DOM-fixup process...
- private boolean shouldThrowAwayLink(Node node, NodeList children,
- int childLen, LinkParams params) {
- if (childLen == 0) {
- // this has no inner structure
- if (params.childLen == 0) return false;
- else return true;
- } else if ((childLen == 1)
- && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
- && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
- // single nested link
- return true;
-
- } else if (childLen == 2) {
-
- Node c0= children.item(0);
- Node c1= children.item(1);
-
- if ((c0.getNodeType() == Node.ELEMENT_NODE)
- && (params.elName.equalsIgnoreCase(c0.getNodeName()))
- && (c1.getNodeType() == Node.TEXT_NODE)
- && hasOnlyWhiteSpace(c1) ) {
- // single link followed by whitespace node
- return true;
- }
-
- if ((c1.getNodeType() == Node.ELEMENT_NODE)
- && (params.elName.equalsIgnoreCase(c1.getNodeName()))
- && (c0.getNodeType() == Node.TEXT_NODE)
- && hasOnlyWhiteSpace(c0) ) {
- // whitespace node followed by single link
- return true;
- }
-
- } else if (childLen == 3) {
- Node c0= children.item(0);
- Node c1= children.item(1);
- Node c2= children.item(2);
-
- if ((c1.getNodeType() == Node.ELEMENT_NODE)
- && (params.elName.equalsIgnoreCase(c1.getNodeName()))
- && (c0.getNodeType() == Node.TEXT_NODE)
- && (c2.getNodeType() == Node.TEXT_NODE)
- && hasOnlyWhiteSpace(c0)
- && hasOnlyWhiteSpace(c2) ) {
- // single link surrounded by whitespace nodes
- return true;
- }
- }
-
- return false;
- }
-
- /**
- * This method finds all anchors below the supplied DOM
- * <code>node</code>, and creates appropriate {@link Outlink}
- * records for each (relative to the supplied <code>base</code>
- * URL), and adds them to the <code>outlinks</code> {@link
- * ArrayList}.
- *
- * <p>
- *
- * Links without inner structure (tags, text, etc) are discarded, as
- * are links which contain only single nested links and empty text
- * nodes (this is a common DOM-fixup artifact, at least with
- * nekohtml).
- */
- public void getOutlinks(URL base, ArrayList outlinks,
- Node node) {
-
- NodeList children = node.getChildNodes();
- int childLen= 0;
- if (children != null)
- childLen= children.getLength();
-
- if (node.getNodeType() == Node.ELEMENT_NODE) {
- String nodeName = node.getNodeName().toLowerCase();
- LinkParams params = (LinkParams)linkParams.get(nodeName);
- if (params != null) {
- if (!shouldThrowAwayLink(node, children, childLen, params)) {
-
- StringBuffer linkText = new StringBuffer();
- getText(linkText, node, true);
-
- NamedNodeMap attrs = node.getAttributes();
- String target = null;
- boolean noFollow = false;
- boolean post = false;
- for (int i= 0; i < attrs.getLength(); i++ ) {
- Node attr = attrs.item(i);
- String attrName = attr.getNodeName();
- if (params.attrName.equalsIgnoreCase(attrName)) {
- target = attr.getNodeValue();
- } else if ("rel".equalsIgnoreCase(attrName) &&
- "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
- noFollow = true;
- } else if ("method".equalsIgnoreCase(attrName) &&
- "post".equalsIgnoreCase(attr.getNodeValue())) {
- post = true;
- }
- }
- if (target != null && !noFollow && !post)
- try {
- URL url = new URL(base, target);
- outlinks.add(new Outlink(url.toString(),
- linkText.toString().trim(), conf));
- } catch (MalformedURLException e) {
- // don't care
- }
- }
- // this should not have any children, skip them
- if (params.childLen == 0) return;
- }
- }
- for ( int i = 0; i < childLen; i++ ) {
- getOutlinks(base, outlinks, children.item(i));
- }
- }
-
-}
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.hadoop.conf.Configuration;
+
+import org.w3c.dom.*;
+
+/**
+ * A collection of methods for extracting content from DOM trees.
+ *
+ * This class holds a few utility methods for pulling content out of
+ * DOM nodes, such as getOutlinks, getText, etc.
+ *
+ */
+public class DOMContentUtils {
+ /**
+ * Describes one kind of link-bearing element: the element name, the
+ * attribute that holds the link target, and a child-count hint.
+ * <p>
+ * Within this class a <code>childLen</code> of 0 marks elements whose
+ * children are not traversed by <code>getOutlinks</code> and which
+ * <code>shouldThrowAwayLink</code> keeps even when they have no child
+ * nodes; any other value means a childless element is discarded.
+ */
+ public static class LinkParams {
+ public String elName;
+ public String attrName;
+ public int childLen;
+
+ public LinkParams(String elName, String attrName, int childLen) {
+ this.elName = elName;
+ this.attrName = attrName;
+ this.childLen = childLen;
+ }
+
+ public String toString() {
+ return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
+ }
+ }
+
+ private HashMap linkParams = new HashMap();
+ private Configuration conf;
+
+ /**
+ * Creates the utility object and configures it by delegating to
+ * {@link #setConf(Configuration)}.
+ *
+ * @param conf the configuration to store and to read link settings from
+ */
+ public DOMContentUtils(Configuration conf) {
+ setConf(conf);
+ }
+ /**
+ * Stores the configuration and rebuilds the table of link-bearing
+ * elements from scratch.
+ * <p>
+ * The table maps a lower-case element name to the {@link LinkParams}
+ * naming the attribute that carries the link target. The
+ * <code>form</code>/<code>action</code> entry is only registered when the
+ * boolean property <code>parser.html.form.use_action</code> is true
+ * (it is read with a default of false).
+ *
+ * @param conf the configuration to store and to read the property from
+ */
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ linkParams.clear();
+ linkParams.put("a", new LinkParams("a", "href", 1));
+ linkParams.put("area", new LinkParams("area", "href", 0));
+ if (conf.getBoolean("parser.html.form.use_action", false)) {
+ linkParams.put("form", new LinkParams("form", "action", 1));
+ }
+ linkParams.put("frame", new LinkParams("frame", "src", 0));
+ linkParams.put("iframe", new LinkParams("iframe", "src", 0));
+ linkParams.put("script", new LinkParams("script", "src", 0));
+ linkParams.put("link", new LinkParams("link", "href", 0));
+ linkParams.put("img", new LinkParams("img", "src", 0));
+ }
+
+ /**
+ * This method takes a {@link StringBuffer} and a DOM {@link Node},
+ * and will append all the content text found beneath the DOM node to
+ * the <code>StringBuffer</code>.
+ *
+ * <p>
+ *
+ * If <code>abortOnNestedAnchors</code> is true, DOM traversal will
+ * be aborted and the <code>StringBuffer</code> will not contain
+ * any text encountered after a nested anchor is found. Text appended
+ * before the nested anchor was reached stays in the buffer.
+ *
+ * <p>
+ *
+ * @param sb the buffer that receives the extracted text
+ * @param node the root of the DOM subtree to read
+ * @param abortOnNestedAnchors whether to stop at the first anchor nested
+ * inside another anchor
+ * @return true if <code>abortOnNestedAnchors</code> is true and nested
+ * anchors were found, false otherwise
+ */
+ public boolean getText(StringBuffer sb, Node node,
+ boolean abortOnNestedAnchors) {
+ if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
+ return true;
+ }
+ return false;
+ }
+
+
+ /**
+ * This is a convenience method, equivalent to {@link
+ * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
+ *
+ * @param sb the buffer that receives the extracted text
+ * @param node the root of the DOM subtree to read
+ */
+ public void getText(StringBuffer sb, Node node) {
+ getText(sb, node, false);
+ }
+
+ // Recursive worker for getText: appends the text of node and its descendants to sb.
+ // Returns true only if abortOnNestedAnchors is true and an anchor nested inside another anchor was found.
+ private boolean getTextHelper(StringBuffer sb, Node node,
+ boolean abortOnNestedAnchors,
+ int anchorDepth) {
+ if ("script".equalsIgnoreCase(node.getNodeName())) { // script subtrees contribute no text
+ return false;
+ }
+ if ("style".equalsIgnoreCase(node.getNodeName())) { // style subtrees contribute no text
+ return false;
+ }
+ if (abortOnNestedAnchors && "a".equalsIgnoreCase(node.getNodeName())) {
+ anchorDepth++; // passed by value, so the count only applies to this node's subtree
+ if (anchorDepth > 1)
+ return true;
+ }
+ if (node.getNodeType() == Node.COMMENT_NODE) {
+ return false;
+ }
+ if (node.getNodeType() == Node.TEXT_NODE) {
+ // collapse each run of whitespace to one space, trim, and join to earlier text with a single space
+ String text = node.getNodeValue();
+ text = text.replaceAll("\\s+", " ");
+ text = text.trim();
+ if (text.length() > 0) {
+ if (sb.length() > 0) sb.append(' ');
+ sb.append(text);
+ }
+ }
+ boolean abort = false;
+ NodeList children = node.getChildNodes();
+ if (children != null) {
+ int len = children.getLength();
+ for (int i = 0; i < len; i++) {
+ if (getTextHelper(sb, children.item(i),
+ abortOnNestedAnchors, anchorDepth)) {
+ abort = true; // propagate the abort upwards and stop visiting siblings
+ break;
+ }
+ }
+ }
+ return abort;
+ }
+
+ /**
+ * This method takes a {@link StringBuffer} and a DOM {@link Node},
+ * and will append the content text found beneath the first
+ * <code>title</code> node to the <code>StringBuffer</code>.
+ * <p>
+ * The search is depth-first in document order and does not descend
+ * into a <code>body</code> node.
+ *
+ * @param sb the buffer that receives the title text
+ * @param node the root of the DOM subtree to search
+ * @return true if a title node was found, false otherwise
+ */
+ public boolean getTitle(StringBuffer sb, Node node) {
+ if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD
+ return false;
+
+ if (node.getNodeType() == Node.ELEMENT_NODE) {
+ if ("title".equalsIgnoreCase(node.getNodeName())) {
+ getText(sb, node);
+ return true;
+ }
+ }
+ NodeList children = node.getChildNodes();
+ if (children != null) {
+ int len = children.getLength();
+ for (int i = 0; i < len; i++) {
+ if (getTitle(sb, children.item(i))) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ /**
+ * If Node contains a BASE tag then its HREF is returned.
+ * <p>
+ * The search is depth-first in document order and does not descend
+ * into a <code>body</code> element. A <code>base</code> element whose
+ * <code>href</code> is not a well-formed absolute URL is ignored and the
+ * search continues.
+ *
+ * @param node the root of the DOM subtree to search
+ * @return the URL of the first usable BASE href, or null if there is none
+ */
+ public URL getBase(Node node) {
+
+ // is this node a BASE tag?
+ if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+ if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD
+ return null;
+
+
+ if ("base".equalsIgnoreCase(node.getNodeName())) {
+ NamedNodeMap attrs = node.getAttributes();
+ for (int i= 0; i < attrs.getLength(); i++ ) {
+ Node attr = attrs.item(i);
+ if ("href".equalsIgnoreCase(attr.getNodeName())) {
+ try {
+ return new URL(attr.getNodeValue());
+ } catch (MalformedURLException e) {} // deliberately ignored: fall through and keep looking
+ }
+ }
+ }
+ }
+
+ // does it contain a base tag?
+ NodeList children = node.getChildNodes();
+ if (children != null) {
+ int len = children.getLength();
+ for (int i = 0; i < len; i++) {
+ URL base = getBase(children.item(i));
+ if (base != null)
+ return base;
+ }
+ }
+
+ // no.
+ return null;
+ }
+
+ /**
+ * Tells whether the node's value consists solely of whitespace
+ * characters (an empty value also counts). The node value is
+ * dereferenced without a null check; the callers in this class only
+ * pass nodes they have already checked to be text nodes.
+ *
+ * @param node the node whose value is inspected
+ * @return true if every character of the value is whitespace
+ */
+ private boolean hasOnlyWhiteSpace(Node node) {
+ String val= node.getNodeValue();
+ for (int i= 0; i < val.length(); i++) {
+ if (!Character.isWhitespace(val.charAt(i)))
+ return false;
+ }
+ return true;
+ }
+
+ // Decides whether a link element should be discarded: true means throw it away, false means keep it.
+ // This only covers a few cases of empty links that are symptomatic of nekohtml's DOM-fixup process...
+ private boolean shouldThrowAwayLink(Node node, NodeList children,
+ int childLen, LinkParams params) {
+ if (childLen == 0) {
+ // this has no inner structure: keep it only for element kinds registered with childLen 0
+ if (params.childLen == 0) return false;
+ else return true;
+ } else if ((childLen == 1)
+ && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
+ && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
+ // single nested link of the same element kind
+ return true;
+
+ } else if (childLen == 2) {
+
+ Node c0= children.item(0);
+ Node c1= children.item(1);
+
+ if ((c0.getNodeType() == Node.ELEMENT_NODE)
+ && (params.elName.equalsIgnoreCase(c0.getNodeName()))
+ && (c1.getNodeType() == Node.TEXT_NODE)
+ && hasOnlyWhiteSpace(c1) ) {
+ // single link followed by whitespace node
+ return true;
+ }
+
+ if ((c1.getNodeType() == Node.ELEMENT_NODE)
+ && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+ && (c0.getNodeType() == Node.TEXT_NODE)
+ && hasOnlyWhiteSpace(c0) ) {
+ // whitespace node followed by single link
+ return true;
+ }
+
+ } else if (childLen == 3) {
+ Node c0= children.item(0);
+ Node c1= children.item(1);
+ Node c2= children.item(2);
+
+ if ((c1.getNodeType() == Node.ELEMENT_NODE)
+ && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+ && (c0.getNodeType() == Node.TEXT_NODE)
+ && (c2.getNodeType() == Node.TEXT_NODE)
+ && hasOnlyWhiteSpace(c0)
+ && hasOnlyWhiteSpace(c2) ) {
+ // single link surrounded by whitespace nodes
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * Handles cases where the url param information is encoded into the base
+ * url as opposed to the target.
+ * <p>
+ * If the target contains params (i.e. ';xxxx') information then the target
+ * params information is assumed to be correct and any base params information
+ * is ignored. If the base contains params information but the target does
+ * not, then the params information is moved to the target allowing it to be
+ * correctly determined by the java.net.URL class.
+ * <p>
+ * NOTE(review): the params are taken as everything from the first ';' of
+ * the base URL string to its end, so a query string or fragment on the
+ * base is carried over to the target as well - confirm this is intended.
+ * <p>
+ * NOTE(review): only '?' is looked for in the target; for a target with a
+ * '#fragment' and no query string the params are appended after the
+ * fragment - confirm this is intended.
+ *
+ * @param base The base URL.
+ * @param target The target path from the base URL.
+ *
+ * @return URL A URL with the params information correctly encoded.
+ *
+ * @throws MalformedURLException If the url is not a well formed URL.
+ */
+ private URL fixEmbeddedParams(URL base, String target)
+ throws MalformedURLException{
+
+ // the target contains params information or the base doesn't then no
+ // conversion necessary, return regular URL
+ if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
+ return new URL(base, target);
+ }
+
+ // get the base url and its params information
+ String baseURL = base.toString();
+ int startParams = baseURL.indexOf(';');
+ String params = baseURL.substring(startParams);
+
+ // if the target has a query string then put the params information after
+ // any path but before the query string, otherwise just append to the path
+ int startQS = target.indexOf('?');
+ if (startQS >= 0) {
+ target = target.substring(0, startQS) + params +
+ target.substring(startQS);
+ }
+ else {
+ target += params;
+ }
+
+ return new URL(base, target);
+ }
+
+ /**
+ * This method finds all link-bearing elements (those registered in
+ * <code>linkParams</code> by <code>setConf</code>, anchors among them)
+ * below the supplied DOM <code>node</code>, and creates appropriate
+ * {@link Outlink} records for each (relative to the supplied
+ * <code>base</code> URL), and adds them to the <code>outlinks</code>
+ * {@link ArrayList}.
+ *
+ * <p>
+ *
+ * Links without inner structure (tags, text, etc) are discarded, as
+ * are links which contain only single nested links and empty text
+ * nodes (this is a common DOM-fixup artifact, at least with
+ * nekohtml).
+ *
+ * <p>
+ *
+ * An element is also skipped when it has a <code>rel</code> attribute
+ * equal to "nofollow" or a <code>method</code> attribute equal to "post"
+ * (both compared ignoring case), when it lacks the link attribute, or
+ * when the resulting URL is malformed.
+ *
+ * @param base the URL against which link targets are resolved
+ * @param outlinks the list that receives the created Outlink objects
+ * @param node the root of the DOM subtree to scan
+ */
+ public void getOutlinks(URL base, ArrayList outlinks,
+ Node node) {
+
+ NodeList children = node.getChildNodes();
+ int childLen= 0;
+ if (children != null)
+ childLen= children.getLength();
+
+ if (node.getNodeType() == Node.ELEMENT_NODE) {
+ String nodeName = node.getNodeName().toLowerCase();
+ LinkParams params = (LinkParams)linkParams.get(nodeName);
+ if (params != null) {
+ if (!shouldThrowAwayLink(node, children, childLen, params)) {
+
+ StringBuffer linkText = new StringBuffer();
+ getText(linkText, node, true); // anchor text stops at the first nested anchor
+
+ NamedNodeMap attrs = node.getAttributes();
+ String target = null;
+ boolean noFollow = false;
+ boolean post = false;
+ for (int i= 0; i < attrs.getLength(); i++ ) {
+ Node attr = attrs.item(i);
+ String attrName = attr.getNodeName();
+ if (params.attrName.equalsIgnoreCase(attrName)) {
+ target = attr.getNodeValue();
+ } else if ("rel".equalsIgnoreCase(attrName) &&
+ "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+ noFollow = true;
+ } else if ("method".equalsIgnoreCase(attrName) &&
+ "post".equalsIgnoreCase(attr.getNodeValue())) {
+ post = true;
+ }
+ }
+ if (target != null && !noFollow && !post)
+ try {
+
+ URL url = (base.toString().indexOf(';') > 0) ?
+ fixEmbeddedParams(base, target) : new URL(base, target);
+ outlinks.add(new Outlink(url.toString(),
+ linkText.toString().trim(), conf));
+ } catch (MalformedURLException e) {
+ // don't care: a target that cannot be resolved is simply not recorded
+ }
+ }
+ // this should not have any children, skip them
+ if (params.childLen == 0) return;
+ }
+ }
+ for ( int i = 0; i < childLen; i++ ) {
+ getOutlinks(base, outlinks, children.item(i));
+ }
+ }
+
+}
+
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?view=diff&rev=516648&r1=516647&r2=516648
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Mar 9 19:55:23 2007
@@ -1,376 +1,408 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import junit.framework.TestCase;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-import java.io.ByteArrayInputStream;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.StringTokenizer;
-
-import org.cyberneko.html.parsers.*;
-import org.xml.sax.*;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-
-/**
- * Unit tests for DOMContentUtils.
- */
-public class TestDOMContentUtils extends TestCase {
-
- private static final String[] testPages= {
- new String("<html><head><title> title </title><script> script </script>"
- + "</head><body> body <a href=\"http://www.nutch.org\">"
- + " anchor </a><!--comment-->"
- + "</body></html>"),
- new String("<html><head><title> title </title><script> script </script>"
- + "</head><body> body <a href=\"/\">"
- + " home </a><!--comment-->"
- + "<style> style </style>"
- + " <a href=\"bot.html\">"
- + " bots </a>"
- + "</body></html>"),
- new String("<html><head><title> </title>"
- + "</head><body> "
- + "<a href=\"/\"> separate this "
- + "<a href=\"ok\"> from this"
- + "</a></a>"
- + "</body></html>"),
- // this one relies on certain neko fixup behavior, possibly
- // distributing the anchors into the LI's-but not the other
- // anchors (outside of them, instead)! So you get a tree that
- // looks like:
- // ... <li> <a href=/> home </a> </li>
- // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
- // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
- new String("<html><head><title> my title </title>"
- + "</head><body> body "
- + "<ul>"
- + "<li> <a href=\"/\"> home"
- + "<li> <a href=\"1\"> 1"
- + "<li> <a href=\"2\"> 2"
- + "</ul>"
- + "</body></html>"),
- // test frameset link extraction. The invalid frame in the middle will be
- // fixed to a third standalone frame.
- new String("<html><head><title> my title </title>"
- + "</head><frameset rows=\"20,*\"> "
- + "<frame src=\"top.html\">"
- + "</frame>"
- + "<frameset cols=\"20,*\">"
- + "<frame src=\"left.html\">"
- + "<frame src=\"invalid.html\"/>"
- + "</frame>"
- + "<frame src=\"right.html\">"
- + "</frame>"
- + "</frameset>"
- + "</frameset>"
- + "</body></html>"),
- // test <area> and <iframe> link extraction + url normalization
- new String("<html><head><title> my title </title>"
- + "</head><body>"
- + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
- + "<map name=\"green\">"
- + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
- + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
- + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
- + "</map>"
- + "<a name=\"bottom\"/><h1> the bottom </h1> "
- + "<iframe src=\"../docs/index.html\"/>"
- + "</body></html>"),
- // test whitespace processing for plain text extraction
- new String("<html><head>\n <title> my\t\n title\r\n </title>\n"
- + " </head>\n"
- + " <body>\n"
- + " <h1> Whitespace\ttest </h1> \n"
- + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
- + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
- + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
- + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
- + "<table>"
- + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
- + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
- + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
- + "</table>put some text here<Br>and there."
- + "<h2>End\tthis\rmadness\n!</h2>\r\n"
- + " . . . ."
- + "</body> </html>"),
-
- // test that <a rel=nofollow> links are not returned
- new String("<html><head></head><body>"
- + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
- + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
- + "</body></html>"),
- // test that POST form actions are skipped
- new String("<html><head></head><body>"
- + "<form method='POST' action='/search.jsp'><input type=text>"
- + "<input type=submit><p>test1</p></form>"
- + "<form method='GET' action='/dummy.jsp'><input type=text>"
- + "<input type=submit><p>test2</p></form></body></html>"),
- // test that all form actions are skipped
- new String("<html><head></head><body>"
- + "<form method='POST' action='/search.jsp'><input type=text>"
- + "<input type=submit><p>test1</p></form>"
- + "<form method='GET' action='/dummy.jsp'><input type=text>"
- + "<input type=submit><p>test2</p></form></body></html>"),
- };
-
- private static int SKIP = 9;
-
- private static String[] testBaseHrefs= {
- "http://www.nutch.org",
- "http://www.nutch.org/docs/foo.html",
- "http://www.nutch.org/docs/",
- "http://www.nutch.org/docs/",
- "http://www.nutch.org/frames/",
- "http://www.nutch.org/maps/",
- "http://www.nutch.org/whitespace/",
- "http://www.nutch.org//",
- "http://www.nutch.org/",
- "http://www.nutch.org/",
- };
-
- private static final DocumentFragment testDOMs[]=
- new DocumentFragment[testPages.length];
-
- private static URL[] testBaseHrefURLs=
- new URL[testPages.length];
-
-
- private static final String[] answerText= {
- "title body anchor",
- "title body home bots",
- "separate this from this",
- "my title body home 1 2",
- "my title",
- "my title the bottom",
- "my title Whitespace test whitespace test "
- + "This is a whitespace test . Newlines should appear as space too. "
- + "Tabs are spaces too. This is a break -> and the line after break . "
- + "one two three space here space there no space "
- + "one two two three three four put some text here and there. "
- + "End this madness ! . . . .",
- "ignore ignore",
- "test1 test2",
- "test1 test2"
- };
-
- private static final String[] answerTitle= {
- "title",
- "title",
- "",
- "my title",
- "my title",
- "my title",
- "my title",
- "",
- "",
- ""
- };
-
- // note: should be in page-order
- private static Outlink[][] answerOutlinks;
-
- private static Configuration conf;
- private static DOMContentUtils utils = null;
-
- public TestDOMContentUtils(String name) {
- super(name);
- }
-
- private static void setup() {
- conf = NutchConfiguration.create();
- conf.setBoolean("parser.html.form.use_action", true);
- utils = new DOMContentUtils(conf);
- DOMFragmentParser parser= new DOMFragmentParser();
- for (int i= 0; i < testPages.length; i++) {
- DocumentFragment node=
- new HTMLDocumentImpl().createDocumentFragment();
- try {
- parser.parse(
- new InputSource(
- new ByteArrayInputStream(testPages[i].getBytes()) ),
- node);
- testBaseHrefURLs[i]= new URL(testBaseHrefs[i]);
- } catch (Exception e) {
- assertTrue("caught exception: " + e, false);
- }
- testDOMs[i]= node;
- }
- try {
- answerOutlinks = new Outlink[][]{
- {
- new Outlink("http://www.nutch.org", "anchor", conf),
- },
- {
- new Outlink("http://www.nutch.org/", "home", conf),
- new Outlink("http://www.nutch.org/docs/bot.html", "bots", conf),
- },
- {
- new Outlink("http://www.nutch.org/", "separate this", conf),
- new Outlink("http://www.nutch.org/docs/ok", "from this", conf),
- },
- {
- new Outlink("http://www.nutch.org/", "home", conf),
- new Outlink("http://www.nutch.org/docs/1", "1", conf),
- new Outlink("http://www.nutch.org/docs/2", "2", conf),
- },
- {
- new Outlink("http://www.nutch.org/frames/top.html", "", conf),
- new Outlink("http://www.nutch.org/frames/left.html", "", conf),
- new Outlink("http://www.nutch.org/frames/invalid.html", "", conf),
- new Outlink("http://www.nutch.org/frames/right.html", "", conf),
- },
- {
- new Outlink("http://www.nutch.org/maps/logo.gif", "", conf),
- new Outlink("http://www.nutch.org/index.html", "", conf),
- new Outlink("http://www.nutch.org/maps/#bottom", "", conf),
- new Outlink("http://www.nutch.org/bot.html", "", conf),
- new Outlink("http://www.nutch.org/docs/index.html", "", conf),
- },
- {
- new Outlink("http://www.nutch.org/index.html", "whitespace test", conf),
- },
- {
- },
- {
- new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf),
- },
- {
- }
- };
-
- } catch (MalformedURLException e) {
-
- }
- }
-
- private static boolean equalsIgnoreWhitespace(String s1, String s2) {
- StringTokenizer st1= new StringTokenizer(s1);
- StringTokenizer st2= new StringTokenizer(s2);
-
- while (st1.hasMoreTokens()) {
- if (!st2.hasMoreTokens())
- return false;
- if ( ! st1.nextToken().equals(st2.nextToken()) )
- return false;
- }
- if (st2.hasMoreTokens())
- return false;
- return true;
- }
-
- public void testGetText() {
- if (testDOMs[0] == null)
- setup();
- for (int i= 0; i < testPages.length; i++) {
- StringBuffer sb= new StringBuffer();
- utils.getText(sb, testDOMs[i]);
- String text= sb.toString();
- assertTrue("expecting text: " + answerText[i]
- + System.getProperty("line.separator")
- + System.getProperty("line.separator")
- + "got text: "+ text,
- equalsIgnoreWhitespace(answerText[i], text));
- }
- }
-
- public void testGetTitle() {
- if (testDOMs[0] == null)
- setup();
- for (int i= 0; i < testPages.length; i++) {
- StringBuffer sb= new StringBuffer();
- utils.getTitle(sb, testDOMs[i]);
- String text= sb.toString();
- assertTrue("expecting text: " + answerText[i]
- + System.getProperty("line.separator")
- + System.getProperty("line.separator")
- + "got text: "+ text,
- equalsIgnoreWhitespace(answerTitle[i], text));
- }
- }
-
- public void testGetOutlinks() {
- if (testDOMs[0] == null)
- setup();
- for (int i= 0; i < testPages.length; i++) {
- ArrayList outlinks= new ArrayList();
- if (i == SKIP) {
- conf.setBoolean("parser.html.form.use_action", false);
- utils.setConf(conf);
- } else {
- conf.setBoolean("parser.html.form.use_action", true);
- utils.setConf(conf);
- }
- utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
- Outlink[] outlinkArr= new Outlink[outlinks.size()];
- outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr);
- compareOutlinks(answerOutlinks[i], outlinkArr);
- }
- }
-
- private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
- for (int i= 0; i < o.length; i++) {
- sb.append(o[i].toString());
- sb.append(System.getProperty("line.separator"));
- }
- }
-
- private static final String outlinksString(Outlink[] o) {
- StringBuffer sb= new StringBuffer();
- appendOutlinks(sb, o);
- return sb.toString();
- }
-
- private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
- if (o1.length != o2.length) {
- assertTrue("got wrong number of outlinks (expecting " + o1.length
- + ", got " + o2.length + ")"
- + System.getProperty("line.separator")
- + "answer: " + System.getProperty("line.separator")
- + outlinksString(o1)
- + System.getProperty("line.separator")
- + "got: " + System.getProperty("line.separator")
- + outlinksString(o2)
- + System.getProperty("line.separator"),
- false
- );
- }
-
- for (int i= 0; i < o1.length; i++) {
- if (!o1[i].equals(o2[i])) {
- assertTrue("got wrong outlinks at position " + i
- + System.getProperty("line.separator")
- + "answer: " + System.getProperty("line.separator")
- + o1[i].toString()
- + System.getProperty("line.separator")
- + "got: " + System.getProperty("line.separator")
- + o2[i].toString(),
- false
- );
-
- }
- }
- }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import junit.framework.TestCase;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.ByteArrayInputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+
+import org.cyberneko.html.parsers.*;
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+
+/**
+ * Unit tests for DOMContentUtils.
+ */
+public class TestDOMContentUtils extends TestCase {
+
+ /**
+  * HTML fixtures, one per test case. The page at index i is parsed by
+  * setup() and paired with index i of testBaseHrefs, answerText,
+  * answerTitle and answerOutlinks.
+  */
+ private static final String[] testPages= {
+   "<html><head><title> title </title><script> script </script>"
+     + "</head><body> body <a href=\"http://www.nutch.org\">"
+     + " anchor </a><!--comment-->"
+     + "</body></html>",
+   "<html><head><title> title </title><script> script </script>"
+     + "</head><body> body <a href=\"/\">"
+     + " home </a><!--comment-->"
+     + "<style> style </style>"
+     + " <a href=\"bot.html\">"
+     + " bots </a>"
+     + "</body></html>",
+   "<html><head><title> </title>"
+     + "</head><body> "
+     + "<a href=\"/\"> separate this "
+     + "<a href=\"ok\"> from this"
+     + "</a></a>"
+     + "</body></html>",
+   // this one relies on certain neko fixup behavior, possibly
+   // distributing the anchors into the LI's-but not the other
+   // anchors (outside of them, instead)! So you get a tree that
+   // looks like:
+   // ... <li> <a href=/> home </a> </li>
+   // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+   // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+   "<html><head><title> my title </title>"
+     + "</head><body> body "
+     + "<ul>"
+     + "<li> <a href=\"/\"> home"
+     + "<li> <a href=\"1\"> 1"
+     + "<li> <a href=\"2\"> 2"
+     + "</ul>"
+     + "</body></html>",
+   // test frameset link extraction. The invalid frame in the middle will be
+   // fixed to a third standalone frame.
+   "<html><head><title> my title </title>"
+     + "</head><frameset rows=\"20,*\"> "
+     + "<frame src=\"top.html\">"
+     + "</frame>"
+     + "<frameset cols=\"20,*\">"
+     + "<frame src=\"left.html\">"
+     + "<frame src=\"invalid.html\"/>"
+     + "</frame>"
+     + "<frame src=\"right.html\">"
+     + "</frame>"
+     + "</frameset>"
+     + "</frameset>"
+     + "</body></html>",
+   // test <area> and <iframe> link extraction + url normalization
+   "<html><head><title> my title </title>"
+     + "</head><body>"
+     + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+     + "<map name=\"green\">"
+     + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+     + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+     + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+     + "</map>"
+     + "<a name=\"bottom\"/><h1> the bottom </h1> "
+     + "<iframe src=\"../docs/index.html\"/>"
+     + "</body></html>",
+   // test whitespace processing for plain text extraction
+   "<html><head>\n <title> my\t\n title\r\n </title>\n"
+     + " </head>\n"
+     + " <body>\n"
+     + " <h1> Whitespace\ttest </h1> \n"
+     + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
+     + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+     + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+     + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
+     + "<table>"
+     + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+     + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+     + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+     + "</table>put some text here<Br>and there."
+     + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+     + " . . . ."
+     + "</body> </html>",
+
+   // test that <a rel=nofollow> links are not returned
+   "<html><head></head><body>"
+     + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+     + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+     + "</body></html>",
+   // test that POST form actions are skipped
+   "<html><head></head><body>"
+     + "<form method='POST' action='/search.jsp'><input type=text>"
+     + "<input type=submit><p>test1</p></form>"
+     + "<form method='GET' action='/dummy.jsp'><input type=text>"
+     + "<input type=submit><p>test2</p></form></body></html>",
+   // test that all form actions are skipped
+   "<html><head></head><body>"
+     + "<form method='POST' action='/search.jsp'><input type=text>"
+     + "<input type=submit><p>test1</p></form>"
+     + "<form method='GET' action='/dummy.jsp'><input type=text>"
+     + "<input type=submit><p>test2</p></form></body></html>",
+   // NUTCH-436: relative links that carry path parameters (;xxx)
+   "<html><head><title> title </title>"
+     + "</head><body>"
+     + "<a href=\";x\">anchor1</a>"
+     + "<a href=\"g;x\">anchor2</a>"
+     + "<a href=\"g;x?y#s\">anchor3</a>"
+     + "</body></html>",
+   // NUTCH-436: relative links resolved against a base URL that itself
+   // carries a path parameter
+   "<html><head><title> title </title>"
+     + "</head><body>"
+     + "<a href=\"g\">anchor1</a>"
+     + "<a href=\"g?y#s\">anchor2</a>"
+     + "<a href=\"?y=1\">anchor3</a>"
+     + "<a href=\"?y=1#s\">anchor4</a>"
+     + "<a href=\"?y=1;somethingelse\">anchor5</a>"
+     + "</body></html>",
+ };
+
+ private static int SKIP = 9;
+
+ private static String[] testBaseHrefs= {
+ "http://www.nutch.org",
+ "http://www.nutch.org/docs/foo.html",
+ "http://www.nutch.org/docs/",
+ "http://www.nutch.org/docs/",
+ "http://www.nutch.org/frames/",
+ "http://www.nutch.org/maps/",
+ "http://www.nutch.org/whitespace/",
+ "http://www.nutch.org//",
+ "http://www.nutch.org/",
+ "http://www.nutch.org/",
+ "http://www.nutch.org/",
+ "http://www.nutch.org/;something"
+ };
+
+ private static final DocumentFragment testDOMs[]=
+ new DocumentFragment[testPages.length];
+
+ private static URL[] testBaseHrefURLs=
+ new URL[testPages.length];
+
+
+ /**
+  * Expected getText() output for each test page, in page order.
+  * testGetText() compares it token by token, so the exact amount of
+  * whitespace between words does not matter.
+  */
+ private static final String[] answerText= {
+ "title body anchor",
+ "title body home bots",
+ "separate this from this",
+ "my title body home 1 2",
+ "my title",
+ "my title the bottom",
+ "my title Whitespace test whitespace test "
+ + "This is a whitespace test . Newlines should appear as space too. "
+ + "Tabs are spaces too. This is a break -> and the line after break . "
+ + "one two three space here space there no space "
+ + "one two two three three four put some text here and there. "
+ + "End this madness ! . . . .",
+ "ignore ignore",
+ "test1 test2",
+ "test1 test2",
+ "title anchor1 anchor2 anchor3",
+ "title anchor1 anchor2 anchor3 anchor4 anchor5"
+ };
+
+ /**
+  * Expected getTitle() output for each test page, in page order. An empty
+  * string is the expectation for pages with a blank or missing title.
+  */
+ private static final String[] answerTitle= {
+ "title",
+ "title",
+ "",
+ "my title",
+ "my title",
+ "my title",
+ "my title",
+ "",
+ "",
+ "",
+ "title",
+ "title"
+ };
+
+ // Expected outlinks for each test page, built by setup().
+ // note: should be in page-order
+ private static Outlink[][] answerOutlinks;
+
+ // Configuration shared by all tests; created in setup() and modified by
+ // testGetOutlinks() to toggle "parser.html.form.use_action".
+ private static Configuration conf;
+ // Object under test; created in setup() from conf.
+ private static DOMContentUtils utils = null;
+
+ /**
+  * Creates the test case, passing the given name to the superclass
+  * constructor.
+  *
+  * @param name the test case name
+  */
+ public TestDOMContentUtils(String name) {
+ super(name);
+ }
+
+ /**
+  * Builds the fixtures shared by all tests: the configuration, the
+  * DOMContentUtils under test, one parsed DocumentFragment and one base URL
+  * per test page, and the expected outlinks.
+  *
+  * The test methods call this only while testDOMs[0] is still null.
+  * Any failure while parsing a page, building a base URL or building an
+  * expected outlink is reported as a test failure.
+  */
+ private static void setup() {
+   conf = NutchConfiguration.create();
+   conf.setBoolean("parser.html.form.use_action", true);
+   utils = new DOMContentUtils(conf);
+   DOMFragmentParser parser= new DOMFragmentParser();
+   for (int i= 0; i < testPages.length; i++) {
+     DocumentFragment node=
+       new HTMLDocumentImpl().createDocumentFragment();
+     try {
+       parser.parse(
+         new InputSource(
+           new ByteArrayInputStream(testPages[i].getBytes()) ),
+         node);
+       testBaseHrefURLs[i]= new URL(testBaseHrefs[i]);
+     } catch (Exception e) {
+       fail("caught exception: " + e);
+     }
+     testDOMs[i]= node;
+   }
+   try {
+     answerOutlinks = new Outlink[][]{
+       {
+         new Outlink("http://www.nutch.org", "anchor", conf),
+       },
+       {
+         new Outlink("http://www.nutch.org/", "home", conf),
+         new Outlink("http://www.nutch.org/docs/bot.html", "bots", conf),
+       },
+       {
+         new Outlink("http://www.nutch.org/", "separate this", conf),
+         new Outlink("http://www.nutch.org/docs/ok", "from this", conf),
+       },
+       {
+         new Outlink("http://www.nutch.org/", "home", conf),
+         new Outlink("http://www.nutch.org/docs/1", "1", conf),
+         new Outlink("http://www.nutch.org/docs/2", "2", conf),
+       },
+       {
+         new Outlink("http://www.nutch.org/frames/top.html", "", conf),
+         new Outlink("http://www.nutch.org/frames/left.html", "", conf),
+         new Outlink("http://www.nutch.org/frames/invalid.html", "", conf),
+         new Outlink("http://www.nutch.org/frames/right.html", "", conf),
+       },
+       {
+         new Outlink("http://www.nutch.org/maps/logo.gif", "", conf),
+         new Outlink("http://www.nutch.org/index.html", "", conf),
+         new Outlink("http://www.nutch.org/maps/#bottom", "", conf),
+         new Outlink("http://www.nutch.org/bot.html", "", conf),
+         new Outlink("http://www.nutch.org/docs/index.html", "", conf),
+       },
+       {
+         new Outlink("http://www.nutch.org/index.html", "whitespace test", conf),
+       },
+       {
+       },
+       {
+         new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf),
+       },
+       {
+       },
+       {
+         new Outlink("http://www.nutch.org/;x", "anchor1", conf),
+         new Outlink("http://www.nutch.org/g;x", "anchor2", conf),
+         new Outlink("http://www.nutch.org/g;x?y#s", "anchor3", conf)
+       },
+       {
+         new Outlink("http://www.nutch.org/g;something", "anchor1", conf),
+         new Outlink("http://www.nutch.org/g;something?y#s", "anchor2", conf),
+         new Outlink("http://www.nutch.org/;something?y=1", "anchor3", conf),
+         new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4", conf),
+         new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5", conf)
+       }
+     };
+   } catch (MalformedURLException e) {
+     // Do not swallow this: answerOutlinks would stay null and
+     // testGetOutlinks() would later die with an unrelated
+     // NullPointerException instead of pointing at the bad URL.
+     fail("could not build expected outlinks: " + e);
+   }
+ }
+
+ /**
+  * Compares two strings token by token, where tokens are separated by
+  * StringTokenizer's default delimiters. The strings are equal when both
+  * yield the same tokens in the same order, regardless of how much
+  * whitespace separates or surrounds them.
+  *
+  * @param s1 first string
+  * @param s2 second string
+  * @return true if both strings produce identical token sequences
+  */
+ private static boolean equalsIgnoreWhitespace(String s1, String s2) {
+   StringTokenizer left = new StringTokenizer(s1);
+   StringTokenizer right = new StringTokenizer(s2);
+
+   // walk both token streams in lock step until either one runs dry
+   while (left.hasMoreTokens() && right.hasMoreTokens()) {
+     String leftToken = left.nextToken();
+     String rightToken = right.nextToken();
+     if (!leftToken.equals(rightToken)) {
+       return false;
+     }
+   }
+   // equal only when both streams ended together
+   return !left.hasMoreTokens() && !right.hasMoreTokens();
+ }
+
+ /**
+  * Runs getText() on every parsed test page and checks the result against
+  * answerText, ignoring differences in whitespace.
+  */
+ public void testGetText() {
+   if (testDOMs[0] == null) {
+     setup();
+   }
+   final String eol = System.getProperty("line.separator");
+   for (int page = 0; page < testPages.length; page++) {
+     StringBuffer extracted = new StringBuffer();
+     utils.getText(extracted, testDOMs[page]);
+     String actual = extracted.toString();
+     String expected = answerText[page];
+     String message = "expecting text: " + expected
+       + eol
+       + eol
+       + "got text: " + actual;
+     assertTrue(message, equalsIgnoreWhitespace(expected, actual));
+   }
+ }
+
+ /**
+  * Runs getTitle() on every parsed test page and checks the result against
+  * answerTitle, ignoring differences in whitespace.
+  */
+ public void testGetTitle() {
+   if (testDOMs[0] == null)
+     setup();
+   for (int i= 0; i < testPages.length; i++) {
+     StringBuffer sb= new StringBuffer();
+     utils.getTitle(sb, testDOMs[i]);
+     String text= sb.toString();
+     // report the expected title; the message used to print answerText[i],
+     // which is not the value this assertion compares against
+     assertTrue("expecting text: " + answerTitle[i]
+       + System.getProperty("line.separator")
+       + System.getProperty("line.separator")
+       + "got text: "+ text,
+       equalsIgnoreWhitespace(answerTitle[i], text));
+   }
+ }
+
+ /**
+  * Runs getOutlinks() on every parsed test page, using that page's base
+  * URL, and compares the result with answerOutlinks.
+  */
+ public void testGetOutlinks() {
+   if (testDOMs[0] == null) {
+     setup();
+   }
+   for (int page = 0; page < testPages.length; page++) {
+     // the use_action flag is on for every page except the one at SKIP
+     conf.setBoolean("parser.html.form.use_action", page != SKIP);
+     utils.setConf(conf);
+
+     ArrayList found = new ArrayList();
+     utils.getOutlinks(testBaseHrefURLs[page], found, testDOMs[page]);
+     Outlink[] actual =
+       (Outlink[]) found.toArray(new Outlink[found.size()]);
+     compareOutlinks(answerOutlinks[page], actual);
+   }
+ }
+
+ /**
+  * Appends the string form of each outlink to the buffer, each followed by
+  * the platform line separator.
+  *
+  * @param sb buffer to append to
+  * @param o outlinks to render
+  */
+ private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
+   final String eol = System.getProperty("line.separator");
+   for (int idx = 0; idx < o.length; idx++) {
+     sb.append(o[idx].toString()).append(eol);
+   }
+ }
+
+ /**
+  * Renders an array of outlinks as one string via appendOutlinks().
+  *
+  * @param o outlinks to render
+  * @return the rendered outlinks
+  */
+ private static final String outlinksString(Outlink[] o) {
+   StringBuffer rendered = new StringBuffer();
+   appendOutlinks(rendered, o);
+   String result = rendered.toString();
+   return result;
+ }
+
+ private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
+ if (o1.length != o2.length) {
+ assertTrue("got wrong number of outlinks (expecting " + o1.length
+ + ", got " + o2.length + ")"
+ + System.getProperty("line.separator")
+ + "answer: " + System.getProperty("line.separator")
+ + outlinksString(o1)
+ + System.getProperty("line.separator")
+ + "got: " + System.getProperty("line.separator")
+ + outlinksString(o2)
+ + System.getProperty("line.separator"),
+ false
+ );
+ }
+
+ for (int i= 0; i < o1.length; i++) {
+ if (!o1[i].equals(o2[i])) {
+ assertTrue("got wrong outlinks at position " + i
+ + System.getProperty("line.separator")
+ + "answer: " + System.getProperty("line.separator")
+ + o1[i].toString()
+ + System.getProperty("line.separator")
+ + "got: " + System.getProperty("line.separator")
+ + o2[i].toString(),
+ false
+ );
+
+ }
+ }
+ }
+}