You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:30 UTC
[14/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/src/test/java/org/apache/nutch/parse/ext/TestExtParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/src/test/java/org/apache/nutch/parse/ext/TestExtParser.java b/nutch-plugins/parse-ext/src/test/java/org/apache/nutch/parse/ext/TestExtParser.java
new file mode 100644
index 0000000..a399273
--- /dev/null
+++ b/nutch-plugins/parse-ext/src/test/java/org/apache/nutch/parse/ext/TestExtParser.java
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.ext;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+/**
+ * Unit tests for ExtParser. First creates a temp file with fixed content, then
+ * fetches it and parses it with the external commands 'cat' and 'md5sum',
+ * alternating between them 10 times. Doing so also serves as a light stress
+ * test for class CommandRunner.java (as used in ExtParser.java).
+ * 
+ * Warning: the test currently runs only on the linux platform.
+ * 
+ * @author John Xing
+ */
+public class TestExtParser {
+  private File tempFile = null;
+  private String urlString = null;
+  private Content content = null;
+  private Parse parse = null;
+
+  private String expectedText = "nutch rocks nutch rocks nutch rocks";
+  // echo -n "nutch rocks nutch rocks nutch rocks" | md5sum
+  private String expectedMD5sum = "df46711a1a48caafc98b1c3b83aa1526";
+
+  /**
+   * Writes expectedText to a temp file and fetches it as Nutch content.
+   * JUnit 4 only accepts public @Before/@After methods.
+   */
+  @Before
+  public void setUp() throws ProtocolException, IOException {
+    // "test.data" is defined in ./src/plugin/build-plugin.xml; when it is
+    // unset the file is created in java.io.tmpdir instead
+    String path = System.getProperty("test.data");
+    File tempDir = (path != null) ? new File(path) : null;
+    if (tempDir != null && !tempDir.exists())
+      tempDir.mkdir();
+    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt",
+        tempDir);
+    urlString = tempFile.toURI().toURL().toString();
+
+    // close the stream even when the write fails
+    FileOutputStream fos = new FileOutputStream(tempFile);
+    try {
+      fos.write(expectedText.getBytes());
+    } finally {
+      fos.close();
+    }
+
+    // get nutch content
+    Protocol protocol = new ProtocolFactory(NutchConfiguration.create())
+        .getProtocol(urlString);
+    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
+        .getContent();
+  }
+
+  /** Drops the fetched content and removes the temp file made by setUp. */
+  @After
+  public void tearDown() {
+    content = null;
+    if (tempFile != null && tempFile.exists())
+      tempFile.delete();
+  }
+
+  /**
+   * Runs the 'cat' and 'md5sum' external parsers alternately, 10 rounds, and
+   * checks the parsed text. Returns without testing on non-linux platforms.
+   */
+  @Test
+  public void testIt() throws ParseException {
+    // now test only on linux platform
+    if (!System.getProperty("os.name").equalsIgnoreCase("linux")) {
+      System.err
+          .println("Current OS is " + System.getProperty("os.name") + ".");
+      System.err.println("No test is run on OS other than linux.");
+      return;
+    }
+
+    Configuration conf = NutchConfiguration.create();
+    // loop alternately, total 10*2 times of invoking external command
+    for (int i = 0; i < 10; i++) {
+      // check external parser that does 'cat'
+      content.setContentType("application/vnd.nutch.example.cat");
+      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
+          content.getUrl());
+      Assert.assertEquals(expectedText, parse.getText());
+
+      // check external parser that does 'md5sum'
+      content.setContentType("application/vnd.nutch.example.md5sum");
+      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
+          content.getUrl());
+      Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/build.xml b/nutch-plugins/parse-html/build.xml
new file mode 100755
index 0000000..a5b99b5
--- /dev/null
+++ b/nutch-plugins/parse-html/build.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-html" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-nekohtml"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-nekohtml/*.jar" />
+    </fileset>
+  </path>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-nekohtml"/>
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/ivy.xml b/nutch-plugins/parse-html/ivy.xml
new file mode 100644
index 0000000..e8a6135
--- /dev/null
+++ b/nutch-plugins/parse-html/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+   <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2.1"/>
+  </dependencies>
+
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/plugin.xml b/nutch-plugins/parse-html/plugin.xml
new file mode 100755
index 0000000..3be70c3
--- /dev/null
+++ b/nutch-plugins/parse-html/plugin.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-html"
+   name="Html Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parse-html.jar">
+         <export name="*"/>
+      </library>
+      <library name="tagsoup-1.2.1.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-nekohtml"/>
+   </requires>
+
+   <extension id="org.apache.nutch.parse.html"
+              name="HtmlParse"
+              point="org.apache.nutch.parse.Parser">
+
+      <implementation id="org.apache.nutch.parse.html.HtmlParser"
+                      class="org.apache.nutch.parse.html.HtmlParser">
+        <parameter name="contentType" value="text/html|application/xhtml+xml"/>
+        <parameter name="pathSuffix" value=""/>
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/pom.xml b/nutch-plugins/parse-html/pom.xml
new file mode 100644
index 0000000..589155b
--- /dev/null
+++ b/nutch-plugins/parse-html/pom.xml
@@ -0,0 +1,49 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-html</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-html</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.ccil.cowan.tagsoup</groupId> <artifactId>tagsoup</artifactId> <version>1.2.1</version>
+        </dependency>
+        <dependency>
+            <groupId>net.sourceforge.nekohtml</groupId>
+            <artifactId>nekohtml</artifactId>
+            <version>1.9.22</version>
+        </dependency>
+
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMBuilder.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMBuilder.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMBuilder.java
new file mode 100644
index 0000000..6a1038b
--- /dev/null
+++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMBuilder.java
@@ -0,0 +1,766 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
+ * avoid dependency on Xalan.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id$
+ */
+package org.apache.nutch.parse.html;
+
+import java.util.Stack;
+
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.Text;
+import org.w3c.dom.CDATASection;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.ext.LexicalHandler;
+
+/**
+ * This class takes SAX events (in addition to some extra events that SAX
+ * doesn't handle yet) and adds the result to a document or document fragment.
+ */
+public class DOMBuilder implements ContentHandler, LexicalHandler {
+
+  /** Root document */
+  public Document m_doc;
+
+  /** Current node */
+  protected Node m_currentNode = null;
+
+  /** First node of document fragment or null if not a DocumentFragment */
+  public DocumentFragment m_docFrag = null;
+
+  /** Stack of the element nodes that are currently open */
+  protected Stack<Element> m_elemStack = new Stack<Element>();
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document fragment.
+   * 
+   * @param doc
+   *          Root document
+   * @param node
+   *          Current node
+   */
+  public DOMBuilder(Document doc, Node node) {
+    m_doc = doc;
+    m_currentNode = node;
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document fragment.
+   * 
+   * @param doc
+   *          Root document
+   * @param docFrag
+   *          Document fragment
+   */
+  public DOMBuilder(Document doc, DocumentFragment docFrag) {
+    m_doc = doc;
+    m_docFrag = docFrag;
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document.
+   * 
+   * @param doc
+   *          Root document
+   */
+  public DOMBuilder(Document doc) {
+    m_doc = doc;
+  }
+
+  /**
+   * Get the root node of the DOM being created. This is either a Document or a
+   * DocumentFragment.
+   * 
+   * @return The root document or document fragment if not null
+   */
+  public Node getRootNode() {
+    return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc;
+  }
+
+  /**
+   * Get the node currently being processed.
+   * 
+   * @return the current node being processed
+   */
+  public Node getCurrentNode() {
+    return m_currentNode;
+  }
+
+  /**
+   * Return null since there is no Writer for this class.
+   * 
+   * @return null
+   */
+  public java.io.Writer getWriter() {
+    return null;
+  }
+
+  /**
+   * Append a node to the current container.
+   * 
+   * @param newNode
+   *          New node to append
+   */
+  protected void append(Node newNode) throws org.xml.sax.SAXException {
+    // Target, in order of preference: current node, fragment, document.
+    Node currentNode = m_currentNode;
+
+    if (null != currentNode) {
+      currentNode.appendChild(newNode);
+
+      // System.out.println(newNode.getNodeName());
+    } else if (null != m_docFrag) {
+      m_docFrag.appendChild(newNode);
+    } else {
+      boolean ok = true;
+      short type = newNode.getNodeType();
+
+      if (type == Node.TEXT_NODE) {
+        String data = newNode.getNodeValue();
+        // Non-blank text directly under the document is reported as an error.
+        if ((null != data) && (data.trim().length() > 0)) {
+          throw new org.xml.sax.SAXException(
+              "Warning: can't output text before document element!  Ignoring...");
+        }
+        // Null or whitespace-only text is silently dropped.
+        ok = false;
+      } else if (type == Node.ELEMENT_NODE) {
+        if (m_doc.getDocumentElement() != null) {
+          throw new org.xml.sax.SAXException(
+              "Can't have more than one root on a DOM!");
+        }
+      }
+
+      if (ok)
+        m_doc.appendChild(newNode);
+    }
+  }
+
+  /**
+   * Receive an object for locating the origin of SAX document events.
+   * 
+   * <p>
+   * SAX parsers are strongly encouraged (though not absolutely required) to
+   * supply a locator: if it does so, it must supply the locator to the
+   * application by invoking this method before invoking any of the other
+   * methods in the ContentHandler interface.
+   * </p>
+   * 
+   * <p>
+   * The locator allows the application to determine the end position of any
+   * document-related event, even if the parser is not reporting an error.
+   * Typically, the application will use this information for reporting its own
+   * errors (such as character content that does not match an application's
+   * business rules). The information returned by the locator is probably not
+   * sufficient for use with a search engine.
+   * </p>
+   * 
+   * <p>
+   * Note that the locator will return correct information only during the
+   * invocation of the events in this interface. The application should not
+   * attempt to use it at any other time.
+   * </p>
+   * 
+   * @param locator
+   *          An object that can return the location of any SAX document event.
+   * @see org.xml.sax.Locator
+   */
+  public void setDocumentLocator(Locator locator) {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the beginning of a document.
+   * 
+   * <p>
+   * The SAX parser will invoke this method only once, before any other methods
+   * in this interface or in DTDHandler (except for setDocumentLocator).
+   * </p>
+   */
+  public void startDocument() throws org.xml.sax.SAXException {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the end of a document.
+   * 
+   * <p>
+   * The SAX parser will invoke this method only once, and it will be the last
+   * method invoked during the parse. The parser shall not invoke this method
+   * until it has either abandoned parsing (because of an unrecoverable error)
+   * or reached the end of input.
+   * </p>
+   */
+  public void endDocument() throws org.xml.sax.SAXException {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the beginning of an element.
+   * 
+   * <p>
+   * The Parser will invoke this method at the beginning of every element in the
+   * XML document; there will be a corresponding endElement() event for every
+   * startElement() event (even when the element is empty). All of the element's
+   * content will be reported, in order, before the corresponding endElement()
+   * event.
+   * </p>
+   * 
+   * <p>
+   * If the element name has a namespace prefix, the prefix will still be
+   * attached. Note that the attribute list provided will contain only
+   * attributes with explicit values (specified or defaulted): #IMPLIED
+   * attributes will be omitted.
+   * </p>
+   * 
+   * 
+   * @param ns
+   *          The namespace of the node
+   * @param localName
+   *          The local part of the qualified name
+   * @param name
+   *          The element name.
+   * @param atts
+   *          The attributes attached to the element, if any.
+   * @see #endElement
+   * @see org.xml.sax.Attributes
+   */
+  public void startElement(String ns, String localName, String name,
+      Attributes atts) throws org.xml.sax.SAXException {
+
+    Element elem;
+
+    // Note that the namespace-aware call must be used to correctly
+    // construct a Level 2 DOM, even for non-namespaced nodes.
+    if ((null == ns) || (ns.length() == 0))
+      elem = m_doc.createElementNS(null, name);
+    else
+      elem = m_doc.createElementNS(ns, name);
+
+    // Attach the element first (this also enforces the single-root rule in
+    // append); its attributes are copied onto it below.
+    append(elem);
+
+    try {
+      int nAtts = atts.getLength();
+
+      if (0 != nAtts) {
+        for (int i = 0; i < nAtts; i++) {
+
+          // First handle a possible ID attribute (setIDAttribute is a hook
+          // that does nothing in this class)
+          if (atts.getType(i).equalsIgnoreCase("ID"))
+            setIDAttribute(atts.getValue(i), elem);
+
+          String attrNS = atts.getURI(i);
+
+          if ("".equals(attrNS))
+            attrNS = null; // DOM represents no-namespace as null
+
+          // Crimson won't let us set an xmlns: attribute on the DOM.
+          String attrQName = atts.getQName(i);
+
+          // In SAX, namespace declarations have an empty namespace, while in
+          // DOM they should have the xmlns namespace. This applies to a bare
+          // "xmlns" (default namespace declaration) as much as to
+          // "xmlns:prefix".
+          if (attrQName.startsWith("xmlns:") || attrQName.equals("xmlns"))
+            attrNS = "http://www.w3.org/2000/xmlns/";
+
+          // ALWAYS use the DOM Level 2 call!
+          elem.setAttributeNS(attrNS, attrQName, atts.getValue(i));
+        }
+      }
+
+      // elem is now the innermost open element: remember it on the stack
+      // and make it the parent of whatever is appended next
+      m_elemStack.push(elem);
+
+      m_currentNode = elem;
+    } catch (java.lang.Exception de) {
+      // Anything thrown while copying attributes (DOMException included)
+      // reaches the caller as a SAXException that wraps the cause
+      throw new org.xml.sax.SAXException(de);
+    }
+
+  }
+
+  /**
+   * 
+   * 
+   * 
+   * Receive notification of the end of an element.
+   * 
+   * <p>
+   * The SAX parser will invoke this method at the end of every element in the
+   * XML document; there will be a corresponding startElement() event for every
+   * endElement() event (even when the element is empty).
+   * </p>
+   * 
+   * <p>
+   * If the element name has a namespace prefix, the prefix will still be
+   * attached to the name.
+   * </p>
+   * 
+   * 
+   * @param ns
+   *          the namespace of the element
+   * @param localName
+   *          The local part of the qualified name of the element
+   * @param name
+   *          The element name
+   */
+  public void endElement(String ns, String localName, String name)
+      throws org.xml.sax.SAXException {
+    m_elemStack.pop(); // EmptyStackException if no element is open
+    m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek();
+  }
+
+  /**
+   * Set an ID string to node association in the ID table.
+   * 
+   * @param id
+   *          The ID string.
+   * @param elem
+   *          The element associated with the ID.
+   */
+  public void setIDAttribute(String id, Element elem) {
+
+    // Do nothing. This method is meant to be overridden.
+  }
+
+  /**
+   * Receive notification of character data.
+   * 
+   * <p>
+   * The Parser will call this method to report each chunk of character data.
+   * SAX parsers may return all contiguous character data in a single chunk, or
+   * they may split it into several chunks; however, all of the characters in
+   * any single event must come from the same external entity, so that the
+   * Locator provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * <p>
+   * Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating parsers must
+   * do so).
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void characters(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
+    // Between startCDATA() and endCDATA() the characters go to cdata().
+    if (m_inCData) {
+      cdata(ch, start, length);
+
+      return;
+    }
+    // Extend a trailing text node if there is one, else append a new node.
+    String s = new String(ch, start, length);
+    Node childNode;
+    childNode = m_currentNode != null ? m_currentNode.getLastChild() : null;
+    if (childNode != null && childNode.getNodeType() == Node.TEXT_NODE) {
+      ((Text) childNode).appendData(s);
+    } else {
+      Text text = m_doc.createTextNode(s);
+      append(text);
+    }
+  }
+
+  /**
+   * If available, when the disable-output-escaping attribute is used, output
+   * raw text without escaping. A PI will be inserted in front of the node with
+   * the name "xslt-next-is-raw" and a value of "formatter-to-dom".
+   * 
+   * @param ch
+   *          Array containing the characters
+   * @param start
+   *          Index to start of characters in the array
+   * @param length
+   *          Number of characters in the array
+   */
+  public void charactersRaw(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
+
+    String s = new String(ch, start, length);
+
+    append(m_doc.createProcessingInstruction("xslt-next-is-raw",
+        "formatter-to-dom"));
+    append(m_doc.createTextNode(s));
+  }
+
+  /**
+   * Report the beginning of an entity.
+   * 
+   * The start and end of the document entity are not reported. The start and
+   * end of the external DTD subset are reported using the pseudo-name "[dtd]".
+   * All other events must be properly nested within start/end entity events.
+   * 
+   * @param name
+   *          The name of the entity. If it is a parameter entity, the name will
+   *          begin with '%'.
+   * @see #endEntity
+   * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
+   * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
+   */
+  public void startEntity(String name) throws org.xml.sax.SAXException {
+
+    // Almost certainly the wrong behavior...
+    // entityReference(name);
+  }
+
+  /**
+   * Report the end of an entity.
+   * 
+   * @param name
+   *          The name of the entity that is ending.
+   * @see #startEntity
+   */
+  public void endEntity(String name) throws org.xml.sax.SAXException {
+  }
+
+  /**
+   * Receive notification of an entity reference.
+   * 
+   * @param name
+   *          name of the entity reference
+   */
+  public void entityReference(String name) throws org.xml.sax.SAXException {
+    append(m_doc.createEntityReference(name));
+  }
+
+  /**
+   * Receive notification of ignorable whitespace in element content.
+   * 
+   * <p>
+   * Validating Parsers must use this method to report each chunk of ignorable
+   * whitespace (see the W3C XML 1.0 recommendation, section 2.10):
+   * non-validating parsers may also use this method if they are capable of
+   * parsing and using content models.
+   * </p>
+   * 
+   * <p>
+   * SAX parsers may return all contiguous whitespace in a single chunk, or they
+   * may split it into several chunks; however, all of the characters in any
+   * single event must come from the same external entity, so that the Locator
+   * provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
+   * @see #characters
+   */
+  public void ignorableWhitespace(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem())
+      return; // avoid DOM006 Hierarchy request error
+
+    String s = new String(ch, start, length);
+
+    append(m_doc.createTextNode(s));
+  }
+
+  /**
+   * Tell if the current node is outside the document element.
+   * 
+   * @return true if the current node is outside the document element.
+   */
+  private boolean isOutsideDocElem() {
+    return (null == m_docFrag)
+        && m_elemStack.size() == 0
+        && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE);
+  }
+
+  /**
+   * Receive notification of a processing instruction.
+   * 
+   * <p>
+   * The Parser will invoke this method once for each processing instruction
+   * found: note that processing instructions may occur before or after the main
+   * document element.
+   * </p>
+   * 
+   * <p>
+   * A SAX parser should never report an XML declaration (XML 1.0, section 2.8)
+   * or a text declaration (XML 1.0, section 4.3.1) using this method.
+   * </p>
+   * 
+   * @param target
+   *          The processing instruction target.
+   * @param data
+   *          The processing instruction data, or null if none was supplied.
+   */
+  public void processingInstruction(String target, String data)
+      throws org.xml.sax.SAXException {
+    append(m_doc.createProcessingInstruction(target, data));
+  }
+
+  /**
+   * Report an XML comment anywhere in the document.
+   * 
+   * This callback will be used for comments inside or outside the document
+   * element, including comments in the external DTD subset (if read).
+   * 
+   * <p>
+   * Calls whose arguments do not describe a valid range of <code>ch</code>
+   * (null array, negative start or length, or a range running past the end of
+   * the array) are silently ignored, because TagSoup sometimes submits invalid
+   * values.
+   * </p>
+   * 
+   * @param ch
+   *          An array holding the characters in the comment.
+   * @param start
+   *          The starting position in the array.
+   * @param length
+   *          The number of characters to use from the array.
+   */
+  public void comment(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    // tagsoup sometimes submits invalid values here
+    if (ch == null || start < 0 || length < 0)
+      return;
+    // A range that ends exactly at the end of the array is still valid; only
+    // ranges running past it must be rejected. (start >= 0 here, so the
+    // subtraction cannot overflow.)
+    if (length > ch.length - start)
+      return;
+    append(m_doc.createComment(new String(ch, start, length)));
+  }
+
+  /**
+   * Flag indicating that we are processing a CData section: set to true by
+   * startCDATA() and reset to false by endCDATA().
+   */
+  protected boolean m_inCData = false;
+
+  /**
+   * Report the start of a CDATA section.
+   * 
+   * @see #endCDATA
+   */
+  public void startCDATA() throws org.xml.sax.SAXException {
+    m_inCData = true;
+    append(m_doc.createCDATASection(""));
+  }
+
+  /**
+   * Report the end of a CDATA section.
+   * 
+   * <p>
+   * Only clears the {@link #m_inCData} flag; no node is created or modified
+   * here.
+   * </p>
+   * 
+   * @see #startCDATA
+   */
+  public void endCDATA() throws org.xml.sax.SAXException {
+    m_inCData = false;
+  }
+
+  /**
+   * Receive notification of cdata.
+   * 
+   * <p>
+   * The Parser will call this method to report each chunk of character data
+   * belonging to a CDATA section. SAX parsers may return all contiguous
+   * character data in a single chunk, or they may split it into several
+   * chunks.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * <p>
+   * The characters are appended to the last child of the current node when
+   * that child is a CDATA section or a comment. In every other case (including
+   * when there is no current node at all) the characters are dropped.
+   * Whitespace-only data reported outside the document element is ignored as
+   * well.
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void cdata(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
+
+    // There may be no current node (isOutsideDocElem() allows for that); in
+    // that case there is no last child to extend, so drop the data instead of
+    // failing with a NullPointerException.
+    if (null == m_currentNode)
+      return;
+
+    String s = new String(ch, start, length);
+
+    // XXX ab@apache.org: modified from the original, to accommodate TagSoup.
+    Node n = m_currentNode.getLastChild();
+    if (n instanceof CDATASection)
+      ((CDATASection) n).appendData(s);
+    else if (n instanceof Comment)
+      ((Comment) n).appendData(s);
+  }
+
+  /**
+   * Report the start of DTD declarations, if any.
+   * 
+   * Any declarations are assumed to be in the internal subset unless otherwise
+   * indicated.
+   * 
+   * <p>
+   * This implementation ignores the event: no node is created for the DTD.
+   * </p>
+   * 
+   * @param name
+   *          The document type name.
+   * @param publicId
+   *          The declared public identifier for the external DTD subset, or
+   *          null if none was declared.
+   * @param systemId
+   *          The declared system identifier for the external DTD subset, or
+   *          null if none was declared.
+   * @see #endDTD
+   * @see #startEntity
+   */
+  public void startDTD(String name, String publicId, String systemId)
+      throws org.xml.sax.SAXException {
+
+    // Do nothing for now.
+  }
+
+  /**
+   * Report the end of DTD declarations.
+   * 
+   * <p>
+   * This implementation ignores the event.
+   * </p>
+   * 
+   * @see #startDTD
+   */
+  public void endDTD() throws org.xml.sax.SAXException {
+
+    // Do nothing for now.
+  }
+
+  /**
+   * Begin the scope of a prefix-URI Namespace mapping.
+   * 
+   * <p>
+   * The information from this event is not necessary for normal Namespace
+   * processing: the SAX XML reader will automatically replace prefixes for
+   * element and attribute names when the http://xml.org/sax/features/namespaces
+   * feature is true (the default).
+   * </p>
+   * 
+   * <p>
+   * There are cases, however, when applications need to use prefixes in
+   * character data or in attribute values, where they cannot safely be expanded
+   * automatically; the start/endPrefixMapping event supplies the information to
+   * the application to expand prefixes in those contexts itself, if necessary.
+   * </p>
+   * 
+   * <p>
+   * Note that start/endPrefixMapping events are not guaranteed to be properly
+   * nested relative to each-other: all startPrefixMapping events will occur
+   * before the corresponding startElement event, and all endPrefixMapping
+   * events will occur after the corresponding endElement event, but their order
+   * is not guaranteed.
+   * </p>
+   * 
+   * <p>
+   * This implementation ignores the event.
+   * </p>
+   * 
+   * @param prefix
+   *          The Namespace prefix being declared.
+   * @param uri
+   *          The Namespace URI the prefix is mapped to.
+   * @see #endPrefixMapping
+   * @see #startElement
+   */
+  public void startPrefixMapping(String prefix, String uri)
+      throws org.xml.sax.SAXException {
+
+    /*
+     * Intentionally empty. An earlier, disabled implementation added an
+     * "xmlns" (for a null or empty prefix) or "xmlns:<prefix>" attribute
+     * carrying the URI to the current node when that node was an element and
+     * did not have the attribute yet. Its author was not sure it was needed
+     * or wanted, and noted that it failed "in the stree".
+     */
+  }
+
+  /**
+   * End the scope of a prefix-URI mapping.
+   * 
+   * <p>
+   * See startPrefixMapping for details. This event will always occur after the
+   * corresponding endElement event, but the order of endPrefixMapping events is
+   * not otherwise guaranteed.
+   * </p>
+   * 
+   * <p>
+   * This implementation ignores the event.
+   * </p>
+   * 
+   * @param prefix
+   *          The prefix that was being mapped.
+   * @see #startPrefixMapping
+   * @see #endElement
+   */
+  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
+  }
+
+  /**
+   * Receive notification of a skipped entity.
+   * 
+   * <p>
+   * The Parser will invoke this method once for each entity skipped.
+   * Non-validating processors may skip entities if they have not seen the
+   * declarations (because, for example, the entity was declared in an external
+   * DTD subset). All processors may skip external entities, depending on the
+   * values of the http://xml.org/sax/features/external-general-entities and the
+   * http://xml.org/sax/features/external-parameter-entities properties.
+   * </p>
+   * 
+   * <p>
+   * This implementation ignores the event.
+   * </p>
+   * 
+   * @param name
+   *          The name of the skipped entity. If it is a parameter entity, the
+   *          name will begin with '%'.
+   */
+  public void skippedEntity(String name) throws org.xml.sax.SAXException {
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMContentUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMContentUtils.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMContentUtils.java
new file mode 100644
index 0000000..3c2aba0
--- /dev/null
+++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -0,0 +1,400 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+import java.util.Collection;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Stack;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
+import org.apache.hadoop.conf.Configuration;
+
+import org.w3c.dom.*;
+
+/**
+ * A collection of methods for extracting content from DOM trees.
+ * 
+ * This class holds a few utility methods for pulling content out of DOM nodes,
+ * such as getOutlinks, getText, etc.
+ * 
+ */
+public class DOMContentUtils {
+
+  public static class LinkParams {
+    public String elName;
+    public String attrName;
+    public int childLen;
+
+    public LinkParams(String elName, String attrName, int childLen) {
+      this.elName = elName;
+      this.attrName = attrName;
+      this.childLen = childLen;
+    }
+
+    public String toString() {
+      return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
+    }
+  }
+
+  private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>();
+  private Configuration conf;
+
+  public DOMContentUtils(Configuration conf) {
+    setConf(conf);
+  }
+
+  public void setConf(Configuration conf) {
+    // forceTags is used to override configurable tag ignoring, later on
+    Collection<String> forceTags = new ArrayList<String>(1);
+
+    this.conf = conf;
+    linkParams.clear();
+    linkParams.put("a", new LinkParams("a", "href", 1));
+    linkParams.put("area", new LinkParams("area", "href", 0));
+    if (conf.getBoolean("parser.html.form.use_action", true)) {
+      linkParams.put("form", new LinkParams("form", "action", 1));
+      if (conf.get("parser.html.form.use_action") != null)
+        forceTags.add("form");
+    }
+    linkParams.put("frame", new LinkParams("frame", "src", 0));
+    linkParams.put("iframe", new LinkParams("iframe", "src", 0));
+    linkParams.put("script", new LinkParams("script", "src", 0));
+    linkParams.put("link", new LinkParams("link", "href", 0));
+    linkParams.put("img", new LinkParams("img", "src", 0));
+
+    // remove unwanted link tags from the linkParams map
+    String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
+    for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) {
+      if (!forceTags.contains(ignoreTags[i]))
+        linkParams.remove(ignoreTags[i]);
+    }
+  }
+
+  /**
+   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
+   * append all the content text found beneath the DOM node to the
+   * <code>StringBuffer</code>.
+   * 
+   * <p>
+   * 
+   * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
+   * and the <code>StringBuffer</code> will not contain any text encountered
+   * after a nested anchor is found.
+   * 
+   * <p>
+   * 
+   * @return true if nested anchors were found
+   */
+  public boolean getText(StringBuffer sb, Node node,
+      boolean abortOnNestedAnchors) {
+    if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * This is a convinience method, equivalent to
+   * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
+   * 
+   */
+  public void getText(StringBuffer sb, Node node) {
+    getText(sb, node, false);
+  }
+
+  // returns true if abortOnNestedAnchors is true and we find nested
+  // anchors
+  private boolean getTextHelper(StringBuffer sb, Node node,
+      boolean abortOnNestedAnchors, int anchorDepth) {
+    boolean abort = false;
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      if ("script".equalsIgnoreCase(nodeName)) {
+        walker.skipChildren();
+      }
+      if ("style".equalsIgnoreCase(nodeName)) {
+        walker.skipChildren();
+      }
+      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
+        anchorDepth++;
+        if (anchorDepth > 1) {
+          abort = true;
+          break;
+        }
+      }
+      if (nodeType == Node.COMMENT_NODE) {
+        walker.skipChildren();
+      }
+      if (nodeType == Node.TEXT_NODE) {
+        // cleanup and trim the value
+        String text = currentNode.getNodeValue();
+        text = text.replaceAll("\\s+", " ");
+        text = text.trim();
+        if (text.length() > 0) {
+          if (sb.length() > 0)
+            sb.append(' ');
+          sb.append(text);
+        }
+      }
+    }
+
+    return abort;
+  }
+
+  /**
+   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
+   * append the content text found beneath the first <code>title</code> node to
+   * the <code>StringBuffer</code>.
+   * 
+   * @return true if a title node was found, false otherwise
+   */
+  public boolean getTitle(StringBuffer sb, Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
+        return false;
+      }
+
+      if (nodeType == Node.ELEMENT_NODE) {
+        if ("title".equalsIgnoreCase(nodeName)) {
+          getText(sb, currentNode);
+          return true;
+        }
+      }
+    }
+
+    return false;
+  }
+
+  /** If Node contains a BASE tag then it's HREF is returned. */
+  public URL getBase(Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      // is this node a BASE tag?
+      if (nodeType == Node.ELEMENT_NODE) {
+
+        if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
+          return null;
+        }
+
+        if ("base".equalsIgnoreCase(nodeName)) {
+          NamedNodeMap attrs = currentNode.getAttributes();
+          for (int i = 0; i < attrs.getLength(); i++) {
+            Node attr = attrs.item(i);
+            if ("href".equalsIgnoreCase(attr.getNodeName())) {
+              try {
+                return new URL(attr.getNodeValue());
+              } catch (MalformedURLException e) {
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // no.
+    return null;
+  }
+
+  private boolean hasOnlyWhiteSpace(Node node) {
+    String val = node.getNodeValue();
+    for (int i = 0; i < val.length(); i++) {
+      if (!Character.isWhitespace(val.charAt(i)))
+        return false;
+    }
+    return true;
+  }
+
+  // this only covers a few cases of empty links that are symptomatic
+  // of nekohtml's DOM-fixup process...
+  private boolean shouldThrowAwayLink(Node node, NodeList children,
+      int childLen, LinkParams params) {
+    if (childLen == 0) {
+      // this has no inner structure
+      if (params.childLen == 0)
+        return false;
+      else
+        return true;
+    } else if ((childLen == 1)
+        && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
+        && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
+      // single nested link
+      return true;
+
+    } else if (childLen == 2) {
+
+      Node c0 = children.item(0);
+      Node c1 = children.item(1);
+
+      if ((c0.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c0.getNodeName()))
+          && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) {
+        // single link followed by whitespace node
+        return true;
+      }
+
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) {
+        // whitespace node followed by single link
+        return true;
+      }
+
+    } else if (childLen == 3) {
+      Node c0 = children.item(0);
+      Node c1 = children.item(1);
+      Node c2 = children.item(2);
+
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE)
+          && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)
+          && hasOnlyWhiteSpace(c2)) {
+        // single link surrounded by whitespace nodes
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * This method finds all anchors below the supplied DOM <code>node</code>, and
+   * creates appropriate {@link Outlink} records for each (relative to the
+   * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
+   * {@link ArrayList}.
+   * 
+   * <p>
+   * 
+   * Links without inner structure (tags, text, etc) are discarded, as are links
+   * which contain only single nested links and empty text nodes (this is a
+   * common DOM-fixup artifact, at least with nekohtml).
+   */
+  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+      NodeList children = currentNode.getChildNodes();
+      int childLen = (children != null) ? children.getLength() : 0;
+
+      if (nodeType == Node.ELEMENT_NODE) {
+
+        nodeName = nodeName.toLowerCase();
+        LinkParams params = (LinkParams) linkParams.get(nodeName);
+        if (params != null) {
+          if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
+
+            StringBuffer linkText = new StringBuffer();
+            getText(linkText, currentNode, true);
+            if (linkText.toString().trim().length() == 0) {
+              // try harder - use img alt if present
+              NodeWalker subWalker = new NodeWalker(currentNode);
+              while (subWalker.hasNext()) {
+                Node subNode = subWalker.nextNode();
+                if (subNode.getNodeType() == Node.ELEMENT_NODE) {
+                  if (subNode.getNodeName().toLowerCase().equals("img")) {
+                    NamedNodeMap subAttrs = subNode.getAttributes();
+                    Node alt = subAttrs.getNamedItem("alt");
+                    if (alt != null) {
+                      String altTxt = alt.getTextContent();
+                      if (altTxt != null && altTxt.trim().length() > 0) {
+                        if (linkText.length() > 0)
+                          linkText.append(' ');
+                        linkText.append(altTxt);
+                      }
+                    }
+                  } else {
+                    // ignore other types of elements
+
+                  }
+                } else if (subNode.getNodeType() == Node.TEXT_NODE) {
+                  String txt = subNode.getTextContent();
+                  if (txt != null && txt.length() > 0) {
+                    if (linkText.length() > 0)
+                      linkText.append(' ');
+                    linkText.append(txt);
+                  }
+                }
+              }
+            }
+
+            NamedNodeMap attrs = currentNode.getAttributes();
+            String target = null;
+            boolean noFollow = false;
+            boolean post = false;
+            for (int i = 0; i < attrs.getLength(); i++) {
+              Node attr = attrs.item(i);
+              String attrName = attr.getNodeName();
+              if (params.attrName.equalsIgnoreCase(attrName)) {
+                target = attr.getNodeValue();
+              } else if ("rel".equalsIgnoreCase(attrName)
+                  && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+                noFollow = true;
+              } else if ("method".equalsIgnoreCase(attrName)
+                  && "post".equalsIgnoreCase(attr.getNodeValue())) {
+                post = true;
+              }
+            }
+            if (target != null && !noFollow && !post)
+              try {
+
+                URL url = URLUtil.resolveURL(base, target);
+                outlinks.add(new Outlink(url.toString(), linkText.toString()
+                    .trim()));
+              } catch (MalformedURLException e) {
+                // don't care
+              }
+          }
+          // this should not have any children, skip them
+          if (params.childLen == 0)
+            continue;
+        }
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
new file mode 100644
index 0000000..159aa76
--- /dev/null
+++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
@@ -0,0 +1,214 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.net.URL;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.w3c.dom.*;
+
+/**
+ * Class for parsing META Directives from DOM trees. This class handles
+ * specifically Robots META directives (all, none, nofollow, noindex), finding
+ * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are
+ * stored in a HTMLMetaTags instance.
+ */
+public class HTMLMetaProcessor {
+
+  /**
+   * Utility class with indicators for the robots directives "noindex" and
+   * "nofollow", and HTTP-EQUIV/no-cache
+   */
+
+  /**
+   * Sets the indicators in <code>robotsMeta</code> to appropriate values, based
+   * on any META tags found under the given <code>node</code>.
+   */
+  public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
+      URL currURL) {
+
+    metaTags.reset();
+    getMetaTagsHelper(metaTags, node, currURL);
+  }
+
+  private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
+      URL currURL) {
+
+    if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+      if ("body".equalsIgnoreCase(node.getNodeName())) {
+        // META tags should not be under body
+        return;
+      }
+
+      if ("meta".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node nameNode = null;
+        Node equivNode = null;
+        Node contentNode = null;
+        // Retrieves name, http-equiv and content attribues
+        for (int i = 0; i < attrs.getLength(); i++) {
+          Node attr = attrs.item(i);
+          String attrName = attr.getNodeName().toLowerCase();
+          if (attrName.equals("name")) {
+            nameNode = attr;
+          } else if (attrName.equals("http-equiv")) {
+            equivNode = attr;
+          } else if (attrName.equals("content")) {
+            contentNode = attr;
+          }
+        }
+
+        if (nameNode != null) {
+          if (contentNode != null) {
+            String name = nameNode.getNodeValue().toLowerCase();
+            metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
+            if ("robots".equals(name)) {
+
+              if (contentNode != null) {
+                String directives = contentNode.getNodeValue().toLowerCase();
+                int index = directives.indexOf("none");
+
+                if (index >= 0) {
+                  metaTags.setNoIndex();
+                  metaTags.setNoFollow();
+                }
+
+                index = directives.indexOf("all");
+                if (index >= 0) {
+                  // do nothing...
+                }
+
+                index = directives.indexOf("noindex");
+                if (index >= 0) {
+                  metaTags.setNoIndex();
+                }
+
+                index = directives.indexOf("nofollow");
+                if (index >= 0) {
+                  metaTags.setNoFollow();
+                }
+
+                index = directives.indexOf("noarchive");
+                if (index >= 0) {
+                  metaTags.setNoCache();
+                }
+              }
+
+            } // end if (name == robots)
+          }
+        }
+
+        if (equivNode != null) {
+          if (contentNode != null) {
+            String name = equivNode.getNodeValue().toLowerCase();
+            String content = contentNode.getNodeValue();
+            metaTags.getHttpEquivTags().setProperty(name, content);
+            if ("pragma".equals(name)) {
+              content = content.toLowerCase();
+              int index = content.indexOf("no-cache");
+              if (index >= 0)
+                metaTags.setNoCache();
+            } else if ("refresh".equals(name)) {
+              int idx = content.indexOf(';');
+              String time = null;
+              if (idx == -1) { // just the refresh time
+                time = content;
+              } else
+                time = content.substring(0, idx);
+              try {
+                metaTags.setRefreshTime(Integer.parseInt(time));
+                // skip this if we couldn't parse the time
+                metaTags.setRefresh(true);
+              } catch (Exception e) {
+                ;
+              }
+              URL refreshUrl = null;
+              if (metaTags.getRefresh() && idx != -1) { // set the URL
+                idx = content.toLowerCase().indexOf("url=");
+                if (idx == -1) { // assume a mis-formatted entry with just the
+                                 // url
+                  idx = content.indexOf(';') + 1;
+                } else
+                  idx += 4;
+                if (idx != -1) {
+                  String url = content.substring(idx);
+                  try {
+                    refreshUrl = new URL(url);
+                  } catch (Exception e) {
+                    // XXX according to the spec, this has to be an absolute
+                    // XXX url. However, many websites use relative URLs and
+                    // XXX expect browsers to handle that.
+                    // XXX Unfortunately, in some cases this may create a
+                    // XXX infinitely recursive paths (a crawler trap)...
+                    // if (!url.startsWith("/")) url = "/" + url;
+                    try {
+                      refreshUrl = new URL(currURL, url);
+                    } catch (Exception e1) {
+                      refreshUrl = null;
+                    }
+                  }
+                }
+              }
+              if (metaTags.getRefresh()) {
+                if (refreshUrl == null) {
+                  // apparently only refresh time was present. set the URL
+                  // to the same URL.
+                  refreshUrl = currURL;
+                }
+                metaTags.setRefreshHref(refreshUrl);
+              }
+            }
+          }
+        }
+
+      } else if ("base".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node hrefNode = attrs.getNamedItem("href");
+
+        if (hrefNode != null) {
+          String urlString = hrefNode.getNodeValue();
+
+          URL url = null;
+          try {
+            if (currURL == null)
+              url = new URL(urlString);
+            else
+              url = new URL(currURL, urlString);
+          } catch (Exception e) {
+            ;
+          }
+
+          if (url != null)
+            metaTags.setBaseHref(url);
+        }
+
+      }
+
+    }
+
+    NodeList children = node.getChildNodes();
+    if (children != null) {
+      int len = children.getLength();
+      for (int i = 0; i < len; i++) {
+        getMetaTagsHelper(metaTags, children.item(i), currURL);
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HtmlParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HtmlParser.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HtmlParser.java
new file mode 100644
index 0000000..4d043ba
--- /dev/null
+++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -0,0 +1,352 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.util.ArrayList;
+import java.util.Map;
+import java.net.URL;
+import java.net.MalformedURLException;
+import java.nio.charset.StandardCharsets;
+import java.io.*;
+import java.util.regex.*;
+
+import org.cyberneko.html.parsers.*;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.protocol.Content;
+import org.apache.hadoop.conf.*;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.util.*;
+
+/**
+ * {@link Parser} implementation for HTML content.
+ * <p>
+ * The raw bytes are turned into a DOM {@link DocumentFragment} using either
+ * NekoHTML or TagSoup (chosen by the <code>parser.html.impl</code> property,
+ * see {@link #setConf(Configuration)}). From that fragment the parser extracts
+ * the HTML meta directives, the text, the title and the outlinks, and finally
+ * passes the result through the configured {@link HtmlParseFilters}.
+ */
+public class HtmlParser implements Parser {
+  public static final Logger LOG = LoggerFactory
+      .getLogger("org.apache.nutch.parse.html");
+
+  // I used 1000 bytes at first, but found that some documents have
+  // meta tag well past the first 1000 bytes.
+  // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
+  // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
+  private static final int CHUNK_SIZE = 8192;
+
+  // Matches a <meta http-equiv="content-type" ...> tag; group 1 holds the
+  // attributes of the tag.
+  // NUTCH-1006 Meta equiv with single quotes not accepted
+  private static final Pattern metaPattern = Pattern.compile(
+      "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
+      Pattern.CASE_INSENSITIVE);
+  // Extracts the charset name (group 1) from the attributes matched above.
+  private static final Pattern charsetPattern = Pattern.compile(
+      "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
+  // Matches the HTML5 short form <meta charset="..."> (charset in group 1).
+  private static final Pattern charsetPatternHTML5 = Pattern.compile(
+      "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
+      Pattern.CASE_INSENSITIVE);
+
+  /** Name of the DOM builder to use, read from <code>parser.html.impl</code>. */
+  private String parserImpl;
+
+  /**
+   * Given a <code>byte[]</code> representing an html file of an
+   * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
+   * from the first <code>CHUNK_SIZE</code> bytes. If there's no meta tag for
+   * Content-Type or no charset is specified, the HTML5 form
+   * <code>&lt;meta charset=...&gt;</code> is tried, and after that the content
+   * is checked for a Unicode Byte Order Mark (BOM). This will also cover
+   * non-byte oriented character encodings (UTF-16 only). If no character set
+   * can be determined, <code>null</code> is returned.
+   * <p>
+   * See also
+   * http://www.w3.org/International/questions/qa-html-encoding-declarations,
+   * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and
+   * http://www.w3.org/TR/REC-xml/#sec-guessing
+   * 
+   * @param content
+   *          <code>byte[]</code> representation of an html file
+   * @return the charset name found in a meta tag, or the encoding implied by a
+   *         BOM (<code>UTF-8</code>, <code>UTF-16LE</code> or
+   *         <code>UTF-16BE</code>), or <code>null</code> if neither is present
+   */
+  private static String sniffCharacterEncoding(byte[] content) {
+    int length = Math.min(content.length, CHUNK_SIZE);
+
+    // Only the ASCII part of the chunk matters for locating the meta tags, so
+    // decoding as US-ASCII is sufficient: bytes outside the ASCII range are
+    // decoded to the replacement character, which cannot be part of the
+    // ASCII-only charset name captured by the patterns.
+    String str = new String(content, 0, length, StandardCharsets.US_ASCII);
+
+    String encoding = null;
+    Matcher metaMatcher = metaPattern.matcher(str);
+    if (metaMatcher.find()) {
+      Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
+      if (charsetMatcher.find()) {
+        encoding = charsetMatcher.group(1);
+      }
+    }
+    if (encoding == null) {
+      // check for HTML5 meta charset
+      metaMatcher = charsetPatternHTML5.matcher(str);
+      if (metaMatcher.find()) {
+        encoding = metaMatcher.group(1);
+      }
+    }
+    if (encoding == null) {
+      // check for BOM
+      if (content.length >= 3 && content[0] == (byte) 0xEF
+          && content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
+        encoding = "UTF-8";
+      } else if (content.length >= 2) {
+        if (content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) {
+          encoding = "UTF-16LE";
+        } else if (content[0] == (byte) 0xFE && content[1] == (byte) 0xFF) {
+          encoding = "UTF-16BE";
+        }
+      }
+    }
+
+    return encoding;
+  }
+
+  /** Fallback encoding, read from <code>parser.character.encoding.default</code>. */
+  private String defaultCharEncoding;
+
+  private Configuration conf;
+
+  private DOMContentUtils utils;
+
+  private HtmlParseFilters htmlParseFilters;
+
+  /**
+   * Value stored under {@link Nutch#CACHING_FORBIDDEN_KEY} for pages whose
+   * meta tags forbid caching; read from
+   * <code>parser.caching.forbidden.policy</code>.
+   */
+  private String cachingPolicy;
+
+  /**
+   * Parses the given HTML content.
+   * <p>
+   * If the base URL of the content is malformed, or building the DOM fails,
+   * the empty parse result produced by
+   * {@link ParseStatus#getEmptyParseResult} for the causing exception is
+   * returned. Text and title are only extracted when the meta tags do not
+   * request "noindex"; outlinks are only extracted when they do not request
+   * "nofollow". A meta refresh is reported through the minor status code
+   * {@link ParseStatus#SUCCESS_REDIRECT} together with the target URL and the
+   * refresh time.
+   * 
+   * @param content
+   *          the fetched content to parse
+   * @return the parse result after the HTML parse filters have been applied
+   */
+  public ParseResult getParse(Content content) {
+    HTMLMetaTags metaTags = new HTMLMetaTags();
+
+    URL base;
+    try {
+      base = new URL(content.getBaseUrl());
+    } catch (MalformedURLException e) {
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    String text = "";
+    String title = "";
+    Outlink[] outlinks = new Outlink[0];
+    Metadata metadata = new Metadata();
+
+    // parse the content
+    DocumentFragment root;
+    try {
+      byte[] contentInOctets = content.getContent();
+      InputSource input = new InputSource(new ByteArrayInputStream(
+          contentInOctets));
+
+      // combine the automatically detected clues with the charset sniffed
+      // from the document itself before settling on an encoding
+      EncodingDetector detector = new EncodingDetector(conf);
+      detector.autoDetectClues(content, true);
+      detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
+      String encoding = detector.guessEncoding(content, defaultCharEncoding);
+
+      metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
+      metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
+
+      input.setEncoding(encoding);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Parsing...");
+      }
+      root = parse(input);
+    } catch (IOException | DOMException | SAXException e) {
+      // expected failure modes of the DOM builders: report without logging
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    } catch (Exception e) {
+      LOG.error("Error: ", e);
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    // get meta directives
+    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
+
+    // populate Nutch metadata with HTML meta directives
+    metadata.addAll(metaTags.getGeneralTags());
+
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
+    }
+    // check meta directives
+    if (!metaTags.getNoIndex()) { // okay to index
+      StringBuffer sb = new StringBuffer();
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting text...");
+      }
+      utils.getText(sb, root); // extract text
+      text = sb.toString();
+      sb.setLength(0); // reuse the buffer for the title
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting title...");
+      }
+      utils.getTitle(sb, root); // extract title
+      title = sb.toString().trim();
+    }
+
+    if (!metaTags.getNoFollow()) { // okay to follow links
+      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
+      URL baseTag = utils.getBase(root);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting links...");
+      }
+      // a base URL found in the document takes precedence over the content's
+      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
+      outlinks = l.toArray(new Outlink[l.size()]);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("found " + outlinks.length + " outlinks in "
+            + content.getUrl());
+      }
+    }
+
+    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
+    if (metaTags.getRefresh()) {
+      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
+      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
+          Integer.toString(metaTags.getRefreshTime()) });
+    }
+    ParseData parseData = new ParseData(status, title, outlinks,
+        content.getMetadata(), metadata);
+    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
+        new ParseImpl(text, parseData));
+
+    // run filters on parse
+    ParseResult filteredParse = this.htmlParseFilters.filter(content,
+        parseResult, metaTags, root);
+    if (metaTags.getNoCache()) { // not okay to cache
+      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
+        entry.getValue().getData().getParseMeta()
+            .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
+    }
+    return filteredParse;
+  }
+
+  /**
+   * Builds the DOM with TagSoup when <code>parserImpl</code> equals "tagsoup"
+   * (ignoring case) and with NekoHTML otherwise, including when no
+   * implementation has been configured.
+   */
+  private DocumentFragment parse(InputSource input) throws Exception {
+    // constant-first comparison: an unset parserImpl selects NekoHTML instead
+    // of raising a NullPointerException
+    if ("tagsoup".equalsIgnoreCase(parserImpl))
+      return parseTagSoup(input);
+    else
+      return parseNeko(input);
+  }
+
+  /**
+   * Parses the input with TagSoup, feeding its SAX events into a
+   * {@link DOMBuilder} that fills a new document fragment.
+   */
+  private DocumentFragment parseTagSoup(InputSource input) throws Exception {
+    HTMLDocumentImpl doc = new HTMLDocumentImpl();
+    DocumentFragment frag = doc.createDocumentFragment();
+    DOMBuilder builder = new DOMBuilder(doc, frag);
+    org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser();
+    reader.setContentHandler(builder);
+    reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
+    reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false);
+    reader
+        .setProperty("http://xml.org/sax/properties/lexical-handler", builder);
+    reader.parse(input);
+    return frag;
+  }
+
+  /**
+   * Parses the input with the NekoHTML {@link DOMFragmentParser}.
+   * <p>
+   * After the first fragment has been read the parser is invoked repeatedly
+   * on the same input until it yields an empty fragment; every non-empty
+   * fragment is appended to the result. An exception raised while reading
+   * these additional fragments is logged and the fragments collected so far
+   * are returned.
+   */
+  private DocumentFragment parseNeko(InputSource input) throws Exception {
+    DOMFragmentParser parser = new DOMFragmentParser();
+    try {
+      parser
+          .setFeature(
+              "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+              true);
+      parser.setFeature("http://cyberneko.org/html/features/augmentations",
+          true);
+      parser.setProperty(
+          "http://cyberneko.org/html/properties/default-encoding",
+          defaultCharEncoding);
+      parser
+          .setFeature(
+              "http://cyberneko.org/html/features/scanner/ignore-specified-charset",
+              true);
+      parser
+          .setFeature(
+              "http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
+              false);
+      parser.setFeature(
+          "http://cyberneko.org/html/features/balance-tags/document-fragment",
+          true);
+      parser.setFeature("http://cyberneko.org/html/features/report-errors",
+          LOG.isTraceEnabled());
+    } catch (SAXException e) {
+      // Configuration is best effort: keep parsing with the settings that
+      // were accepted, but leave a trace of the rejected one.
+      LOG.debug("Failed to configure the NekoHTML parser: ", e);
+    }
+    // convert Document to DocumentFragment
+    HTMLDocumentImpl doc = new HTMLDocumentImpl();
+    doc.setErrorChecking(false);
+    DocumentFragment res = doc.createDocumentFragment();
+    DocumentFragment frag = doc.createDocumentFragment();
+    parser.parse(input, frag);
+    res.appendChild(frag);
+
+    try {
+      while (true) {
+        frag = doc.createDocumentFragment();
+        parser.parse(input, frag);
+        if (!frag.hasChildNodes())
+          break;
+        if (LOG.isInfoEnabled()) {
+          LOG.info(" - new frag, " + frag.getChildNodes().getLength()
+              + " nodes.");
+        }
+        res.appendChild(frag);
+      }
+    } catch (Exception e) {
+      LOG.error("Error: ", e);
+    }
+    return res;
+  }
+
+  /**
+   * Command line entry point: parses the local HTML file named by the first
+   * argument and prints the resulting parse data and text to standard output.
+   * 
+   * @param args
+   *          <code>args[0]</code> is the path of the file to parse
+   * @throws Exception
+   *           if the file cannot be read
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length < 1) {
+      System.err.println("Usage: HtmlParser <file>");
+      System.exit(-1);
+    }
+    String name = args[0];
+    String url = "file:" + name;
+    File file = new File(name);
+    byte[] bytes = new byte[(int) file.length()];
+    // try-with-resources so the file handle is released even if reading fails
+    try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
+      in.readFully(bytes);
+    }
+    Configuration conf = NutchConfiguration.create();
+    HtmlParser parser = new HtmlParser();
+    parser.setConf(conf);
+    Parse parse = parser.getParse(
+        new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(
+        url);
+    System.out.println("data: " + parse.getData());
+
+    System.out.println("text: " + parse.getText());
+
+  }
+
+  /**
+   * Stores the configuration and initializes everything derived from it:
+   * the HTML parse filters, the DOM utilities and the properties
+   * <code>parser.html.impl</code> (default "neko"),
+   * <code>parser.character.encoding.default</code> (default "windows-1252")
+   * and <code>parser.caching.forbidden.policy</code> (default
+   * {@link Nutch#CACHING_FORBIDDEN_CONTENT}).
+   * 
+   * @param conf
+   *          the configuration to use
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.htmlParseFilters = new HtmlParseFilters(getConf());
+    this.parserImpl = getConf().get("parser.html.impl", "neko");
+    this.defaultCharEncoding = getConf().get(
+        "parser.character.encoding.default", "windows-1252");
+    this.utils = new DOMContentUtils(conf);
+    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+        Nutch.CACHING_FORBIDDEN_CONTENT);
+  }
+
+  /**
+   * @return the configuration passed to {@link #setConf(Configuration)}
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
new file mode 100644
index 0000000..eb382e8
--- /dev/null
+++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
@@ -0,0 +1,112 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer,
+ * XXX in order to avoid dependency on Xalan.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id$
+ */
+package org.apache.nutch.parse.html;
+
+/**
+ * Class used to verify whether the specified <var>ch</var> conforms to the XML
+ * 1.0 definition of whitespace.
+ */
+public class XMLCharacterRecognizer {
+
+  /**
+   * Returns whether the specified <var>ch</var> conforms to the XML 1.0
+   * definition of whitespace. Refer to <A
+   * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of
+   * <CODE>S</CODE></A> for details.
+   * 
+   * @param ch
+   *          Character to check as XML whitespace.
+   * @return =true if <var>ch</var> is XML whitespace; otherwise =false.
+   */
+  public static boolean isWhiteSpace(char ch) {
+    return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA);
+  }
+
+  /**
+   * Tell if the string is whitespace.
+   * 
+   * @param ch
+   *          Character array to check as XML whitespace.
+   * @param start
+   *          Start index of characters in the array
+   * @param length
+   *          Number of characters in the array
+   * @return True if the characters in the array are XML whitespace; otherwise,
+   *         false.
+   */
+  public static boolean isWhiteSpace(char ch[], int start, int length) {
+
+    int end = start + length;
+
+    for (int s = start; s < end; s++) {
+      if (!isWhiteSpace(ch[s]))
+        return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Tell if the string is whitespace.
+   * 
+   * @param buf
+   *          StringBuffer to check as XML whitespace.
+   * @return True if characters in buffer are XML whitespace, false otherwise
+   */
+  public static boolean isWhiteSpace(StringBuffer buf) {
+
+    int n = buf.length();
+
+    for (int i = 0; i < n; i++) {
+      if (!isWhiteSpace(buf.charAt(i)))
+        return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Tell if the string is whitespace.
+   * 
+   * @param s
+   *          String to check as XML whitespace.
+   * @return True if characters in buffer are XML whitespace, false otherwise
+   */
+  public static boolean isWhiteSpace(String s) {
+
+    if (null != s) {
+      int n = s.length();
+
+      for (int i = 0; i < n; i++) {
+        if (!isWhiteSpace(s.charAt(i)))
+          return false;
+      }
+    }
+
+    return true;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/package.html b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/package.html
new file mode 100644
index 0000000..c650389
--- /dev/null
+++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>An HTML document parsing plugin.</p><p>This package relies on <a href="http://www.apache.org/~andyc/neko/doc/html/index.html">NekoHTML</a>.</p>
+</body>
+</html>