You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2005/06/02 00:20:05 UTC
svn commit: r179436 [2/3] - in /incubator/nutch/trunk: ./ conf/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/protocol/ src/java/org/apache/nutch/tools/ src/plugin/ src/plugin/creativecommons/src/java/org/creativecommons/nutch/ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/ src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/ src/plugin/parse-html/ src/plugin/parse-html/lib/ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/ src/plugin/parse-js/ src/plugin/parse-js/src/ src/plugin/parse-js/src/java/ src/plugin/parse-js/src/java/org/ src/plugin/parse-js/src/java/org/apache/ src/plugin/parse-js/src/java/org/apache/nutch/ src/plugin/parse-js/src/java/org/apache/nutch/parse/ src/plugin/parse-js/src/java/org/apache/nutch/parse/js/ src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/ src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/ src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/ src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/ src/plugin/parse-text/src/java/org/apache/nutch/parse/text/ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ src/plugin/protocol-httpclient/ src/plugin/protocol-httpclient/lib/ src/plugin/protocol-httpclient/src/ src/plugin/protocol-httpclient/src/java/ src/plugin/protocol-httpclient/src/java/org/ src/plugin/protocol-httpclient/src/java/org/apache/ src/plugin/protocol-httpclient/src/java/org/apache/nutch/ src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/ src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/ src/test/org/apache/nutch/fetcher/ src/test/org/apache/nutch/parse/ 
src/test/org/apache/nutch/tools/

Added: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java (added)
+++ incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java Wed Jun  1 15:20:01 2005
@@ -0,0 +1,738 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
+ * avoid dependency on Xalan.
+ */
+
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id: DOMBuilder.java,v 1.19 2004/02/25 13:07:51 aruny Exp $
+ */
+package org.apache.nutch.parse.html;
+
+import java.util.Stack;
+
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.Text;
+import org.w3c.dom.CDATASection;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.ext.LexicalHandler;
+/**
+ * This class takes SAX events (in addition to some extra events
+ * that SAX doesn't handle yet) and adds the result to a document
+ * or document fragment.
+ * @xsl.usage general
+ */
+public class DOMBuilder
+        implements ContentHandler, LexicalHandler
+{
+
+  /** Root document          */
+  public Document m_doc;
+
+  /** Current node           */
+  protected Node m_currentNode = null;
+
+  /** First node of document fragment or null if not a DocumentFragment     */
+  public DocumentFragment m_docFrag = null;
+
+  /** Stack of the element nodes that are currently open */
+  protected Stack m_elemStack = new Stack();
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes
+   * to the given current node.
+   *
+   * @param doc Root document
+   * @param node Current node
+   */
+  public DOMBuilder(Document doc, Node node)
+  {
+    this.m_doc = doc;           // used to create every new node
+    this.m_currentNode = node;  // initial parent that append() adds nodes to
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes
+   * to the document fragment.
+   *
+   * @param doc Root document
+   * @param docFrag Document fragment
+   */
+  public DOMBuilder(Document doc, DocumentFragment docFrag)
+  {
+    this.m_docFrag = docFrag;  // append() adds top-level nodes to this fragment
+    this.m_doc = doc;          // used to create every new node
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes
+   * to the document.
+   *
+   * @param doc Root document
+   */
+  public DOMBuilder(Document doc)
+  {
+    this.m_doc = doc;  // both creates the nodes and receives the top-level ones
+  }
+
+  /**
+   * Get the root node of the DOM being created.  This
+   * is either a Document or a DocumentFragment.
+   *
+   * @return The root document or document fragment if not null
+   */
+  public Node getRootNode()
+  {
+    return (m_docFrag == null) ? (Node) m_doc : (Node) m_docFrag;
+  }
+
+  /**
+   * Get the node currently being processed.
+   *
+   * @return the current node being processed
+   */
+  public Node getCurrentNode()
+  {
+    return this.m_currentNode;  // may be null, e.g. after the outermost element has ended
+  }
+
+  /**
+   * Return null since there is no Writer for this class.
+   *
+   * @return null
+   */
+  public java.io.Writer getWriter()
+  {
+    return null;  // output goes into DOM nodes; no Writer is involved
+  }
+
+  /**
+   * Append a node to the current container.
+   *
+   * @param newNode New node to append
+   */
+  protected void append(Node newNode) throws org.xml.sax.SAXException
+  {
+
+    Node currentNode = m_currentNode;
+
+    if (null != currentNode)
+    {
+      currentNode.appendChild(newNode);
+
+      // System.out.println(newNode.getNodeName());
+    }
+    else if (null != m_docFrag)
+    {
+      m_docFrag.appendChild(newNode);
+    }
+    else
+    {
+      boolean ok = true;
+      short type = newNode.getNodeType();
+
+      if (type == Node.TEXT_NODE)
+      {
+        String data = newNode.getNodeValue();
+
+        if ((null != data) && (data.trim().length() > 0))
+        {
+          throw new org.xml.sax.SAXException("Warning: can't output text before document element!  Ignoring...");
+        }
+
+        ok = false;
+      }
+      else if (type == Node.ELEMENT_NODE)
+      {
+        if (m_doc.getDocumentElement() != null)
+        {
+          throw new org.xml.sax.SAXException("Can't have more than one root on a DOM!");
+        }
+      }
+
+      if (ok)
+        m_doc.appendChild(newNode);
+    }
+  }
+
+  /**
+   * Receive an object for locating the origin of SAX document events.
+   *
+   * <p>SAX parsers are strongly encouraged (though not absolutely
+   * required) to supply a locator: if it does so, it must supply
+   * the locator to the application by invoking this method before
+   * invoking any of the other methods in the ContentHandler
+   * interface.</p>
+   *
+   * <p>The locator allows the application to determine the end
+   * position of any document-related event, even if the parser is
+   * not reporting an error.  Typically, the application will
+   * use this information for reporting its own errors (such as
+   * character content that does not match an application's
+   * business rules).  The information returned by the locator
+   * is probably not sufficient for use with a search engine.</p>
+   *
+   * <p>Note that the locator will return correct information only
+   * during the invocation of the events in this interface.  The
+   * application should not attempt to use it at any other time.</p>
+   *
+   * @param locator An object that can return the location of
+   *                any SAX document event.
+   * @see org.xml.sax.Locator
+   */
+  public void setDocumentLocator(Locator locator)
+  {
+    // Intentionally empty: the supplied locator is neither stored nor used,
+    // so this builder records no source positions.
+  }
+
+  /**
+   * Receive notification of the beginning of a document.
+   *
+   * <p>The SAX parser will invoke this method only once, before any
+   * other methods in this interface or in DTDHandler (except for
+   * setDocumentLocator).</p>
+   */
+  public void startDocument() throws org.xml.sax.SAXException
+  {
+    // Intentionally empty: no set-up is performed here; the target Document
+    // was already supplied to the constructor.
+  }
+
+  /**
+   * Receive notification of the end of a document.
+   *
+   * <p>The SAX parser will invoke this method only once, and it will
+   * be the last method invoked during the parse.  The parser shall
+   * not invoke this method until it has either abandoned parsing
+   * (because of an unrecoverable error) or reached the end of
+   * input.</p>
+   */
+  public void endDocument() throws org.xml.sax.SAXException
+  {
+    // Intentionally empty: nothing is finalised or flushed; the tree built so
+    // far stays reachable through getRootNode().
+  }
+
+  /**
+   * Receive notification of the beginning of an element.
+   *
+   * <p>The Parser will invoke this method at the beginning of every
+   * element in the XML document; there will be a corresponding
+   * endElement() event for every startElement() event (even when the
+   * element is empty). All of the element's content will be
+   * reported, in order, before the corresponding endElement()
+   * event.</p>
+   *
+   * <p>If the element name has a namespace prefix, the prefix will
+   * still be attached.  Note that the attribute list provided will
+   * contain only attributes with explicit values (specified or
+   * defaulted): #IMPLIED attributes will be omitted.</p>
+   *
+   *
+   * @param ns The namespace of the node
+   * @param localName The local part of the qualified name
+   * @param name The element name.
+   * @param atts The attributes attached to the element, if any.
+   * @see #endElement
+   * @see org.xml.sax.Attributes
+   */
+  public void startElement(
+          String ns, String localName, String name, Attributes atts)
+            throws org.xml.sax.SAXException
+  {
+
+    // Note that the namespace-aware call must be used to correctly
+    // construct a Level 2 DOM, even for non-namespaced nodes.  DOM
+    // represents "no namespace" as null, never as an empty string.
+    final String elemNS = ((null == ns) || (ns.length() == 0)) ? null : ns;
+    final Element elem = m_doc.createElementNS(elemNS, name);
+
+    // Attach the element to its parent before it becomes the current node.
+    append(elem);
+
+    // Copy the attributes.  DOM failures raised while doing so are
+    // reported to the caller as a SAXException (see the catch below).
+    try
+    {
+      final int nAtts = atts.getLength();
+
+      for (int i = 0; i < nAtts; i++)
+      {
+
+        // First handle a possible ID attribute.  The comparison is
+        // written constant-first so that a null type cannot cause an NPE.
+        if ("ID".equalsIgnoreCase(atts.getType(i)))
+          setIDAttribute(atts.getValue(i), elem);
+
+        // Namespace URI reported by SAX for this attribute; an empty
+        // string means that the attribute is in no namespace.
+        String attrNS = atts.getURI(i);
+
+        if ("".equals(attrNS))
+          attrNS = null; // DOM represents no-namespace as null
+
+        // Crimson won't let us set an xmlns: attribute on the DOM.
+        final String attrQName = atts.getQName(i);
+
+        // In SAX, namespace declarations have an empty namespace, while
+        // in DOM they must be in the xmlns namespace.  That holds for the
+        // default declaration ("xmlns") as well as for prefixed ones
+        // ("xmlns:p"); DOM Level 2 setAttributeNS raises NAMESPACE_ERR
+        // for either form when a different namespace is passed.
+        if (attrQName.startsWith("xmlns:") || attrQName.equals("xmlns"))
+          attrNS = "http://www.w3.org/2000/xmlns/";
+
+        // ALWAYS use the DOM Level 2 call!
+        elem.setAttributeNS(attrNS, attrQName, atts.getValue(i));
+      }
+
+      // The new element is now the parent of subsequent nodes, until the
+      // matching endElement() pops it from the stack again.
+      m_elemStack.push(elem);
+
+      m_currentNode = elem;
+    }
+    catch(java.lang.Exception de)
+    {
+      // Wrap the failure so that it travels through the SAX contract; the
+      // original exception stays available from the SAXException.
+      throw new org.xml.sax.SAXException(de);
+    }
+
+  }
+
+  /**
+
+
+
+   * Receive notification of the end of an element.
+   *
+   * <p>The SAX parser will invoke this method at the end of every
+   * element in the XML document; there will be a corresponding
+   * startElement() event for every endElement() event (even when the
+   * element is empty).</p>
+   *
+   * <p>If the element name has a namespace prefix, the prefix will
+   * still be attached to the name.</p>
+   *
+   *
+   * @param ns the namespace of the element
+   * @param localName The local part of the qualified name of the element
+   * @param name The element name
+   */
+  public void endElement(String ns, String localName, String name)
+          throws org.xml.sax.SAXException
+  {
+    m_elemStack.pop();  // the element that just ended
+    m_currentNode = (m_elemStack.size() > 0) ? (Node) m_elemStack.peek() : null;
+  }
+
+  /**
+   * Set an ID string to node association in the ID table.
+   *
+   * @param id The ID string.
+   * @param elem The element associated with the ID.
+   */
+  public void setIDAttribute(String id, Element elem)
+  {
+    // Do nothing. This method is meant to be overridden by subclasses that
+    // keep an ID table; startElement() calls it for attributes of type "ID".
+  }
+
+  /**
+   * Receive notification of character data.
+   *
+   * <p>The Parser will call this method to report each chunk of
+   * character data.  SAX parsers may return all contiguous character
+   * data in a single chunk, or they may split it into several
+   * chunks; however, all of the characters in any single event
+   * must come from the same external entity, so that the Locator
+   * provides useful information.</p>
+   *
+   * <p>The application must not attempt to read from the array
+   * outside of the specified range.</p>
+   *
+   * <p>Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating
+   * parsers must do so).</p>
+   *
+   * @param ch The characters from the XML document.
+   * @param start The start position in the array.
+   * @param length The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void characters(char ch[], int start, int length) throws org.xml.sax.SAXException
+  {
+    // Whitespace-only data outside the document element is dropped
+    // (avoids DOM006 Hierarchy request error).
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return;
+
+    // Inside a CDATA section the data is handed to cdata() instead.
+    if (m_inCData)
+    {
+      cdata(ch, start, length);
+      return;
+    }
+
+    final String chunk = new String(ch, start, length);
+    final Node last = (m_currentNode == null) ? null : m_currentNode.getLastChild();
+
+    // Merge with a directly preceding text node instead of adding a new one.
+    if (last != null && last.getNodeType() == Node.TEXT_NODE)
+      ((Text) last).appendData(chunk);
+    else
+      append(m_doc.createTextNode(chunk));
+  }
+
+  /**
+   * If available, when the disable-output-escaping attribute is used,
+   * output raw text without escaping.  A PI will be inserted in front
+   * of the node with the name "xslt-next-is-raw" and a value of
+   * "formatter-to-dom".
+   *
+   * @param ch Array containing the characters
+   * @param start Index to start of characters in the array
+   * @param length Number of characters in the array
+   */
+  public void charactersRaw(char ch[], int start, int length)
+          throws org.xml.sax.SAXException
+  {
+    if(isOutsideDocElem()
+       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return;  // avoid DOM006 Hierarchy request error
+
+
+    String s = new String(ch, start, length);
+
+    append(m_doc.createProcessingInstruction("xslt-next-is-raw",
+                                             "formatter-to-dom"));
+    append(m_doc.createTextNode(s));
+  }
+
+  /**
+   * Report the beginning of an entity.
+   *
+   * The start and end of the document entity are not reported.
+   * The start and end of the external DTD subset are reported
+   * using the pseudo-name "[dtd]".  All other events must be
+   * properly nested within start/end entity events.
+   *
+   * @param name The name of the entity.  If it is a parameter
+   *        entity, the name will begin with '%'.
+   * @see #endEntity
+   * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
+   * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
+   */
+  public void startEntity(String name) throws org.xml.sax.SAXException
+  {
+    // Intentionally empty: entity boundaries are not recorded in the DOM.
+    // Almost certainly the wrong behavior...
+    // entityReference(name);
+  }
+
+  /**
+   * Report the end of an entity.
+   *
+   * @param name The name of the entity that is ending.
+   * @see #startEntity
+   */
+  public void endEntity(String name) throws org.xml.sax.SAXException{}  // no-op: entity boundaries are not recorded
+
+  /**
+   * Receive notification of an entity reference.
+   *
+   * @param name name of the entity reference
+   */
+  public void entityReference(String name) throws org.xml.sax.SAXException
+  {
+    append(this.m_doc.createEntityReference(name));  // parent is chosen by append()
+  }
+
+  /**
+   * Receive notification of ignorable whitespace in element content.
+   *
+   * <p>Validating Parsers must use this method to report each chunk
+   * of ignorable whitespace (see the W3C XML 1.0 recommendation,
+   * section 2.10): non-validating parsers may also use this method
+   * if they are capable of parsing and using content models.</p>
+   *
+   * <p>SAX parsers may return all contiguous whitespace in a single
+   * chunk, or they may split it into several chunks; however, all of
+   * the characters in any single event must come from the same
+   * external entity, so that the Locator provides useful
+   * information.</p>
+   *
+   * <p>The application must not attempt to read from the array
+   * outside of the specified range.</p>
+   *
+   * @param ch The characters from the XML document.
+   * @param start The start position in the array.
+   * @param length The number of characters to read from the array.
+   * @see #characters
+   */
+  public void ignorableWhitespace(char ch[], int start, int length)
+          throws org.xml.sax.SAXException
+  {
+    // Inside the document element the whitespace is kept as a text node;
+    // outside it is dropped (avoids DOM006 Hierarchy request error).
+    if (!isOutsideDocElem())
+    {
+      append(m_doc.createTextNode(new String(ch, start, length)));
+    }
+  }
+
+  /**
+   * Tell if the current node is outside the document element.
+   *
+   * @return true if the current node is outside the document element.
+   */
+   private boolean isOutsideDocElem()
+   {
+      return !(m_docFrag != null || !m_elemStack.isEmpty() || (m_currentNode != null && m_currentNode.getNodeType() != Node.DOCUMENT_NODE));
+   }
+
+  /**
+   * Receive notification of a processing instruction.
+   *
+   * <p>The Parser will invoke this method once for each processing
+   * instruction found: note that processing instructions may occur
+   * before or after the main document element.</p>
+   *
+   * <p>A SAX parser should never report an XML declaration (XML 1.0,
+   * section 2.8) or a text declaration (XML 1.0, section 4.3.1)
+   * using this method.</p>
+   *
+   * @param target The processing instruction target.
+   * @param data The processing instruction data, or null if
+   *        none was supplied.
+   */
+  public void processingInstruction(String target, String data)
+          throws org.xml.sax.SAXException
+  {
+    append(this.m_doc.createProcessingInstruction(target, data));  // parent is chosen by append()
+  }
+
+  /**
+   * Report an XML comment anywhere in the document.
+   *
+   * This callback will be used for comments inside or outside the
+   * document element, including comments in the external DTD
+   * subset (if read).
+   *
+   * @param ch An array holding the characters in the comment.
+   * @param start The starting position in the array.
+   * @param length The number of characters to use from the array.
+   */
+  public void comment(char ch[], int start, int length) throws org.xml.sax.SAXException
+  {
+    append(this.m_doc.createComment(String.valueOf(ch, start, length)));
+  }
+
+  /** Flag indicating that we are processing a CData section          */
+  protected boolean m_inCData = false;
+
+  /**
+   * Report the start of a CDATA section.
+   *
+   * @see #endCDATA
+   */
+  public void startCDATA() throws org.xml.sax.SAXException
+  {
+    this.m_inCData = true;  // characters() now routes its data to cdata()
+    append(this.m_doc.createCDATASection(""));  // empty node, filled by cdata()
+  }
+
+  /**
+   * Report the end of a CDATA section.
+   *
+   * @see #startCDATA
+   */
+  public void endCDATA() throws org.xml.sax.SAXException
+  {
+    this.m_inCData = false;  // characters() builds ordinary text nodes again
+  }
+
+  /**
+   * Receive notification of cdata.
+   *
+   * <p>The Parser will call this method to report each chunk of
+   * character data.  SAX parsers may return all contiguous character
+   * data in a single chunk, or they may split it into several
+   * chunks; however, all of the characters in any single event
+   * must come from the same external entity, so that the Locator
+   * provides useful information.</p>
+   *
+   * <p>The application must not attempt to read from the array
+   * outside of the specified range.</p>
+   *
+   * <p>Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating
+   * parsers must do so).</p>
+   *
+   * @param ch The characters from the XML document.
+   * @param start The start position in the array.
+   * @param length The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void cdata(char ch[], int start, int length) throws org.xml.sax.SAXException
+  {
+    if(isOutsideDocElem()
+       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return;  // avoid DOM006 Hierarchy request error
+    // m_currentNode is null while no element is open; search where append() adds nodes.
+    Node parent = (null != m_currentNode) ? m_currentNode
+                : (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc;
+    String s = new String(ch, start, length);
+    Node n = (null != parent) ? parent.getLastChild() : null;
+    // XXX ab@apache.org: modified from the original, to accommodate TagSoup;
+    // data is dropped unless the last child is a CDATA section or a comment.
+    if (n instanceof CDATASection) ((CDATASection)n).appendData(s);
+    else if (n instanceof Comment) ((Comment)n).appendData(s);
+  }
+
+  /**
+   * Report the start of DTD declarations, if any.
+   *
+   * Any declarations are assumed to be in the internal subset
+   * unless otherwise indicated.
+   *
+   * @param name The document type name.
+   * @param publicId The declared public identifier for the
+   *        external DTD subset, or null if none was declared.
+   * @param systemId The declared system identifier for the
+   *        external DTD subset, or null if none was declared.
+   * @see #endDTD
+   * @see #startEntity
+   */
+  public void startDTD(String name, String publicId, String systemId)
+          throws org.xml.sax.SAXException
+  {
+    // Do nothing for now: the declaration is ignored and no DocumentType
+    // node is created from it.
+  }
+
+  /**
+   * Report the end of DTD declarations.
+   *
+   * @see #startDTD
+   */
+  public void endDTD() throws org.xml.sax.SAXException
+  {
+    // Do nothing for now: startDTD() records nothing, so there is no state
+    // to close here.
+  }
+
+  /**
+   * Begin the scope of a prefix-URI Namespace mapping.
+   *
+   * <p>The information from this event is not necessary for
+   * normal Namespace processing: the SAX XML reader will
+   * automatically replace prefixes for element and attribute
+   * names when the http://xml.org/sax/features/namespaces
+   * feature is true (the default).</p>
+   *
+   * <p>There are cases, however, when applications need to
+   * use prefixes in character data or in attribute values,
+   * where they cannot safely be expanded automatically; the
+   * start/endPrefixMapping event supplies the information
+   * to the application to expand prefixes in those contexts
+   * itself, if necessary.</p>
+   *
+   * <p>Note that start/endPrefixMapping events are not
+   * guaranteed to be properly nested relative to each-other:
+   * all startPrefixMapping events will occur before the
+   * corresponding startElement event, and all endPrefixMapping
+   * events will occur after the corresponding endElement event,
+   * but their order is not guaranteed.</p>
+   *
+   * @param prefix The Namespace prefix being declared.
+   * @param uri The Namespace URI the prefix is mapped to.
+   * @see #endPrefixMapping
+   * @see #startElement
+   */
+  public void startPrefixMapping(String prefix, String uri)
+          throws org.xml.sax.SAXException
+  {
+    // Intentionally empty; the draft implementation below is disabled.
+    /*
+    // Not sure if this is needed or wanted
+    // Also, it fails in the stree.
+    if((null != m_currentNode)
+       && (m_currentNode.getNodeType() == Node.ELEMENT_NODE))
+    {
+      String qname;
+      if(((null != prefix) && (prefix.length() == 0))
+         || (null == prefix))
+        qname = "xmlns";
+      else
+        qname = "xmlns:"+prefix;
+
+      Element elem = (Element)m_currentNode;
+      String val = elem.getAttribute(qname); // Obsolete, should be DOM2...?
+      if(val == null)
+      {
+        elem.setAttributeNS("http://www.w3.org/XML/1998/namespace",
+                            qname, uri);
+      }
+    }
+    */
+  }
+
+  /**
+   * End the scope of a prefix-URI mapping.
+   *
+   * <p>See startPrefixMapping for details.  This event will
+   * always occur after the corresponding endElement event,
+   * but the order of endPrefixMapping events is not otherwise
+   * guaranteed.</p>
+   *
+   * @param prefix The prefix that was being mapped.
+   * @see #startPrefixMapping
+   * @see #endElement
+   */
+  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException{}  // no-op: prefix mappings are not tracked
+
+  /**
+   * Receive notification of a skipped entity.
+   *
+   * <p>The Parser will invoke this method once for each entity
+   * skipped.  Non-validating processors may skip entities if they
+   * have not seen the declarations (because, for example, the
+   * entity was declared in an external DTD subset).  All processors
+   * may skip external entities, depending on the values of the
+   * http://xml.org/sax/features/external-general-entities and the
+   * http://xml.org/sax/features/external-parameter-entities
+   * properties.</p>
+   *
+   * @param name The name of the skipped entity.  If it is a
+   *        parameter entity, the name will begin with '%'.
+   */
+  public void skippedEntity(String name) throws org.xml.sax.SAXException{}  // no-op: skipped entities add nothing to the DOM
+}

Propchange: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (added)
+++ incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java Wed Jun  1 15:20:01 2005
@@ -0,0 +1,186 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.net.URL;
+import java.util.Properties;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.w3c.dom.*;
+
+/**
+ * Class for parsing META Directives from DOM trees.  This class
+ * handles specifically Robots META directives (all, none, nofollow,
+ * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
+ * instructions. All meta directives are stored in a HTMLMetaTags instance.
+ */
+public class HTMLMetaProcessor {
+
+  /**
+   * Utility class with indicators for the robots directives "noindex"
+   * and "nofollow", and HTTP-EQUIV/no-cache
+   */
+  
+  /**
+   * Sets the indicators in <code>metaTags</code> to appropriate
+   * values, based on any META tags found under the given
+   * <code>node</code>.
+   */
+  public static final void getMetaTags (
+    HTMLMetaTags metaTags, Node node, URL currURL) {
+    // Reset first, so that metaTags reflects only the tree under node.
+    metaTags.reset();
+    getMetaTagsHelper(metaTags, node, currURL);
+  }
+
+  private static final void getMetaTagsHelper(
+    HTMLMetaTags metaTags, Node node, URL currURL) {
+
+    if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+      if ("body".equalsIgnoreCase(node.getNodeName())) {
+        // META tags should not be under body
+        return;
+      }
+
+      if ("meta".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node nameNode = attrs.getNamedItem("name");
+        Node equivNode = attrs.getNamedItem("http-equiv");
+        Node contentNode = attrs.getNamedItem("content");
+
+        // A META tag without a content attribute carries nothing to record.
+        if (contentNode != null) {
+          String content = contentNode.getNodeValue();
+
+          if (nameNode != null) {
+            String name = nameNode.getNodeValue().toLowerCase();
+            metaTags.getGeneralTags().setProperty(name, content);
+            if ("robots".equals(name)) {
+              applyRobotsDirectives(metaTags, content.toLowerCase());
+            }
+          }
+
+          if (equivNode != null) {
+            String name = equivNode.getNodeValue().toLowerCase();
+            metaTags.getHttpEquivTags().setProperty(name, content);
+            if ("pragma".equals(name)) {
+              if (content.toLowerCase().indexOf("no-cache") >= 0)
+                metaTags.setNoCache();
+            } else if ("refresh".equals(name)) {
+              applyRefresh(metaTags, content, currURL);
+            }
+          }
+        }
+
+      } else if ("base".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node hrefNode = attrs.getNamedItem("href");
+
+        if (hrefNode != null) {
+          String urlString = hrefNode.getNodeValue();
+
+          URL url = null;
+          try {
+            if (currURL == null)
+              url = new URL(urlString);
+            else
+              url = new URL(currURL, urlString);
+          } catch (Exception e) {
+            // Malformed BASE HREF: setBaseHref() is simply not called.
+          }
+
+          if (url != null)
+            metaTags.setBaseHref(url);
+        }
+      }
+    }
+
+    // Descend into the children; a BODY element ends the search above.
+    NodeList children = node.getChildNodes();
+    if (children != null) {
+      int len = children.getLength();
+      for (int i = 0; i < len; i++) {
+        getMetaTagsHelper(metaTags, children.item(i), currURL);
+      }
+    }
+  }
+
+  /**
+   * Applies the robots directives "none", "noindex" and "nofollow";
+   * "all" and unknown directives leave the indicators untouched.
+   *
+   * @param directives lower-cased content of the robots META tag
+   */
+  private static void applyRobotsDirectives(
+    HTMLMetaTags metaTags, String directives) {
+
+    if (directives.indexOf("none") >= 0) {
+      metaTags.setNoIndex();
+      metaTags.setNoFollow();
+    }
+    if (directives.indexOf("noindex") >= 0) {
+      metaTags.setNoIndex();
+    }
+    if (directives.indexOf("nofollow") >= 0) {
+      metaTags.setNoFollow();
+    }
+  }
+
+  /**
+   * Records an HTTP-EQUIV refresh of the form "seconds[; url=target]".
+   *
+   * @param content raw content of the refresh META tag
+   * @param currURL URL of the page, used to resolve a relative target
+   */
+  private static void applyRefresh(
+    HTMLMetaTags metaTags, String content, URL currURL) {
+
+    int sep = content.indexOf(';');
+    // Trim first: Integer.parseInt rejects surrounding blanks ("5 ; url=x").
+    String time = (sep == -1 ? content : content.substring(0, sep)).trim();
+    try {
+      metaTags.setRefreshTime(Integer.parseInt(time));
+      // skip this if we couldn't parse the time
+      metaTags.setRefresh(true);
+    } catch (Exception e) {
+      // Unparsable delay: the refresh indicator is left as it was.
+    }
+    if (!metaTags.getRefresh() || sep == -1) {
+      return; // nothing to refresh, or just the refresh time
+    }
+
+    int idx = content.toLowerCase().indexOf("url=");
+    // without "url=" assume a mis-formatted entry with just the url
+    String url = content.substring(idx == -1 ? sep + 1 : idx + 4).trim();
+    URL refreshUrl = null;
+    try {
+      refreshUrl = new URL(url);
+    } catch (Exception e) {
+      // Not an absolute URL.  Resolve it against the page URL as it is;
+      // forcing a leading "/" would turn "next.html" into "/next.html".
+      try {
+        refreshUrl = new URL(currURL, url);
+      } catch (Exception e1) {
+        // Unresolvable target: refreshUrl stays null, handled below.
+      }
+    }
+    if (refreshUrl == null) metaTags.setRefresh(false);
+    metaTags.setRefreshHref(refreshUrl);
+  }
+
+}

Propchange: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed Jun  1 15:20:01 2005
@@ -28,14 +28,11 @@
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.w3c.dom.*;
-import org.w3c.dom.html.*;
 import org.apache.html.dom.*;
 
-import org.apache.nutch.fetcher.FetcherOutput;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.*;
 import org.apache.nutch.parse.*;
-import org.apache.nutch.parse.html.RobotsMetaProcessor.*;
 
 
 public class HtmlParser implements Parser {
@@ -52,6 +49,8 @@
   private static Pattern charsetPattern =
     Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
                     Pattern.CASE_INSENSITIVE);
+  
+  private static String parserImpl = NutchConf.get().get("parser.html.impl", "neko");
 
   /**
    * Given a <code>byte[]</code> representing an html file of an 
@@ -94,22 +93,14 @@
   private static String defaultCharEncoding =
     NutchConf.get().get("parser.character.encoding.default", "windows-1252");
 
-  public Parse getParse(Content content) throws ParseException {
-    DOMParser parser = new DOMParser();
-    
-    // some plugins, e.g., creativecommons, need to examine html comments
-    try {
-      parser.setFeature("http://apache.org/xml/features/include-comments", 
-                        true);
-    } catch (SAXException e) {}
-
-    RobotsMetaIndicator robotsMeta = new RobotsMetaIndicator();
+  public Parse getParse(Content content) {
+    HTMLMetaTags metaTags = new HTMLMetaTags();
 
     URL base;
     try {
       base = new URL(content.getBaseUrl());
     } catch (MalformedURLException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
     }
 
     String text = "";
@@ -120,19 +111,18 @@
     // check that contentType is one we can handle
     String contentType = content.getContentType();
     if (!"".equals(contentType) && !contentType.startsWith("text/html"))
-      throw new ParseException("Content-Type not text/html: " + contentType);
+      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+              "Content-Type not text/html: " + contentType).getEmptyParse();
     
     // parse the content
     DocumentFragment root;
     try {
       byte[] contentInOctets = content.getContent();
-      InputSource input =
-        new InputSource(new ByteArrayInputStream(contentInOctets));
+      InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
       String encoding = StringUtil.parseCharacterEncoding(contentType);
       if (encoding!=null) {
         metadata.put("OriginalCharEncoding", encoding);
         if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
-	  input.setEncoding(encoding); 
           metadata.put("CharEncodingForConversion", encoding);
           LOG.fine(base + ": setting encoding to " + encoding);
         }
@@ -144,7 +134,6 @@
         if (encoding!=null) {
           metadata.put("OriginalCharEncoding", encoding);
           if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
-	    input.setEncoding(encoding); 
             metadata.put("CharEncodingForConversion", encoding);
             LOG.fine(base + ": setting encoding to " + encoding);
           }
@@ -158,33 +147,29 @@
         // (e.g. se: windows-1252, kr: x-windows-949, cn: gb18030, tw: big5
         // doesn't work for jp because euc-jp and shift_jis have about the
         // same share)
-       
+        encoding = defaultCharEncoding;
         metadata.put("CharEncodingForConversion", defaultCharEncoding);
-        input.setEncoding(defaultCharEncoding);
         LOG.fine(base + ": falling back to " + defaultCharEncoding);
       }
-
+      input.setEncoding(encoding);
       LOG.fine("Parsing...");
-      parser.parse(input);
-
-      // convert Document to DocumentFragment
-      HTMLDocumentImpl doc = (HTMLDocumentImpl)parser.getDocument();
-      doc.setErrorChecking(false);
-      root = doc.createDocumentFragment();
-      root.appendChild(doc.getDocumentElement());
+      root = parse(input);
     } catch (IOException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
     } catch (DOMException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
     } catch (SAXException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
+    } catch (Exception e) {
+      e.printStackTrace();
+      return new ParseStatus(e).getEmptyParse();
     }
       
     // get meta directives
-    RobotsMetaProcessor.getRobotsMetaDirectives(robotsMeta, root, base);
-      
+    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
+    LOG.info("Meta tags for " + base + ": " + metaTags.toString());
     // check meta directives
-    if (!robotsMeta.getNoIndex()) {               // okay to index
+    if (!metaTags.getNoIndex()) {               // okay to index
       StringBuffer sb = new StringBuffer();
       LOG.fine("Getting text...");
       DOMContentUtils.getText(sb, root);          // extract text
@@ -195,7 +180,7 @@
       title = sb.toString().trim();
     }
       
-    if (!robotsMeta.getNoFollow()) {              // okay to follow links
+    if (!metaTags.getNoFollow()) {              // okay to follow links
       ArrayList l = new ArrayList();              // extract outlinks
       URL baseTag = DOMContentUtils.getBase(root);
       LOG.fine("Getting links...");
@@ -204,20 +189,78 @@
       LOG.fine("found "+outlinks.length+" outlinks in "+content.getUrl());
     }
     
-    if (!robotsMeta.getNoCache()) {             // okay to cache
+    if (!metaTags.getNoCache()) {             // okay to cache
       // ??? FIXME ???
     }
     
     // copy content metadata through
     metadata.putAll(content.getMetadata());
-
-    ParseData parseData = new ParseData(title, outlinks, metadata);
+    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
+    if (metaTags.getRefresh()) {
+      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
+      status.setMessage(metaTags.getRefreshHref().toString());
+    }
+    ParseData parseData = new ParseData(status, title, outlinks, metadata);
     Parse parse = new ParseImpl(text, parseData);
 
     // run filters on parse
-    return HtmlParseFilters.filter(content, parse, root);
+    return HtmlParseFilters.filter(content, parse, metaTags, root);
   }
 
+  /** Parses with TagSoup if "parser.html.impl" is "tagsoup", else with Neko. */
+  private DocumentFragment parse(InputSource input) throws Exception {
+    boolean useTagSoup = parserImpl.equalsIgnoreCase("tagsoup");
+    return useTagSoup ? parseTagSoup(input) : parseNeko(input);
+  }
+  
+  private DocumentFragment parseTagSoup(InputSource input) throws Exception {
+    HTMLDocumentImpl document = new HTMLDocumentImpl();
+    DocumentFragment fragment = document.createDocumentFragment();  // returned result
+    DOMBuilder domBuilder = new DOMBuilder(document, fragment);
+    org.ccil.cowan.tagsoup.Parser tagSoup = new org.ccil.cowan.tagsoup.Parser();
+    tagSoup.setContentHandler(domBuilder);       // builder gets the content events...
+    tagSoup.setFeature(tagSoup.ignoreBogonsFeature, true);
+    tagSoup.setFeature(tagSoup.bogonsEmptyFeature, false);
+    tagSoup.setProperty("http://xml.org/sax/properties/lexical-handler", domBuilder); // ...and the lexical ones
+    tagSoup.parse(input);
+    return fragment;
+  }
+  
+  /** Parses input with NekoHTML, merging the fragments of repeated parse() calls. */
+  private DocumentFragment parseNeko(InputSource input) throws Exception {
+    DOMFragmentParser parser = new DOMFragmentParser();
+    try {
+      // some plugins, e.g., creativecommons, need to examine html comments
+      parser.setFeature("http://apache.org/xml/features/include-comments", true);
+      parser.setFeature("http://apache.org/xml/features/augmentations", true);
+      parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
+              false);
+      parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
+              true);
+      parser.setFeature("http://cyberneko.org/html/features/report-errors", true);
+    } catch (SAXException e) {  // best effort: keep the features that were already set
+      LOG.log(Level.FINE, "Could not set Neko parser feature", e);
+    }
+    HTMLDocumentImpl doc = new HTMLDocumentImpl();
+    doc.setErrorChecking(false);
+    DocumentFragment res = doc.createDocumentFragment();
+    DocumentFragment frag = doc.createDocumentFragment();
+    parser.parse(input, frag);  // an exception here propagates to the caller
+    res.appendChild(frag);
+    try {
+      while(true) {
+        frag = doc.createDocumentFragment();
+        parser.parse(input, frag);
+        if (!frag.hasChildNodes()) break;
+        LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes.");
+        res.appendChild(frag);
+      }
+    } catch (Exception x) {     // keep the fragments parsed so far
+      LOG.log(Level.WARNING, "Error parsing remaining HTML fragments", x);
+    }
+    return res;
+  }
+  
   public static void main(String[] args) throws Exception {
     LOG.setLevel(Level.FINE);
     String name = args[0];

Added: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java (added)
+++ incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java Wed Jun  1 15:20:01 2005
@@ -0,0 +1,113 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer,
+ * XXX in order to avoid dependency on Xalan.
+ */
+
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id: XMLCharacterRecognizer.java,v 1.7 2004/02/17 04:21:14 minchau Exp $
+ */
+package org.apache.nutch.parse.html;
+
+/**
+ * Utility methods that test whether characters conform to the XML 1.0
+ * definition of whitespace (space, tab, carriage return, line feed).
+ * @xsl.usage internal
+ */
+public class XMLCharacterRecognizer
+{
+
+  /**
+   * Returns whether the specified <var>ch</var> conforms to the XML 1.0 definition
+   * of whitespace.  Refer to <A href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S">
+   * the definition of <CODE>S</CODE></A> for details.
+   * @param ch Character to check as XML whitespace.
+   * @return true if <var>ch</var> is XML whitespace; otherwise false.
+   */
+  public static boolean isWhiteSpace(char ch)
+  {
+    return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA);
+  }
+
+  /**
+   * Tell if a range of a character array is whitespace.
+   *
+   * @param ch Character array to check as XML whitespace.
+   * @param start Index of the first character to check
+   * @param length Number of characters to check, starting at start
+   * @return True if all checked characters are XML whitespace
+   * (also when length is not positive); otherwise, false.
+   */
+  public static boolean isWhiteSpace(char ch[], int start, int length)
+  {
+
+    int end = start + length;
+
+    for (int s = start; s < end; s++)
+    {
+      if (!isWhiteSpace(ch[s]))
+        return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Tell if the buffer contains only whitespace.
+   *
+   * @param buf StringBuffer to check as XML whitespace.
+   * @return True if buf is empty or holds only XML whitespace, false otherwise
+   */
+  public static boolean isWhiteSpace(StringBuffer buf)
+  {
+
+    int n = buf.length();
+
+    for (int i = 0; i < n; i++)
+    {
+      if (!isWhiteSpace(buf.charAt(i)))
+        return false;
+    }
+
+    return true;
+  }
+  
+  /**
+   * Tell if the string contains only whitespace.
+   *
+   * @param s String to check as XML whitespace; may be null.
+   * @return True if s is null, empty or holds only XML whitespace, false otherwise
+   */
+  public static boolean isWhiteSpace(String s)
+  {
+
+    if(null != s)
+    {
+      int n = s.length();
+  
+      for (int i = 0; i < n; i++)
+      {
+        if (!isWhiteSpace(s.charAt(i)))
+          return false;
+      }
+    }
+
+    return true;
+  }
+
+}

Propchange: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Wed Jun  1 15:20:01 2005
@@ -205,6 +205,7 @@
            new Outlink("http://www.nutch.org/frames/right.html", ""),
          },
          {
+           new Outlink("http://www.nutch.org/maps/logo.gif", ""),
            new Outlink("http://www.nutch.org/index.html", ""),
            new Outlink("http://www.nutch.org/maps/#bottom", ""),
            new Outlink("http://www.nutch.org/bot.html", ""),

Modified: incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java Wed Jun  1 15:20:01 2005
@@ -18,7 +18,8 @@
 
 import junit.framework.TestCase;
 
-import org.apache.nutch.parse.html.RobotsMetaProcessor.*;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.html.HTMLMetaProcessor.*;
 
 import java.io.ByteArrayInputStream;
 import java.net.URL;
@@ -28,7 +29,7 @@
 import org.w3c.dom.*;
 import org.apache.html.dom.*;
 
-/** Unit tests for RobotsMetaProcessor. */
+/** Unit tests for HTMLMetaProcessor. */
 public class TestRobotsMetaProcessor extends TestCase {
   public TestRobotsMetaProcessor(String name) { 
     super(name); 
@@ -157,8 +158,8 @@
         e.printStackTrace();
       }
 
-      RobotsMetaIndicator robotsMeta= new RobotsMetaIndicator();
-      RobotsMetaProcessor.getRobotsMetaDirectives(robotsMeta, node, 
+      HTMLMetaTags robotsMeta= new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node, 
                                                   currURLsAndAnswers[i][0]);
 
       assertTrue("got index wrong on test " + i,

Added: incubator/nutch/trunk/src/plugin/parse-js/build.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-js/build.xml?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-js/build.xml (added)
+++ incubator/nutch/trunk/src/plugin/parse-js/build.xml Wed Jun  1 15:20:01 2005
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="parse-js" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Propchange: incubator/nutch/trunk/src/plugin/parse-js/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/nutch/trunk/src/plugin/parse-js/plugin.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-js/plugin.xml?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-js/plugin.xml (added)
+++ incubator/nutch/trunk/src/plugin/parse-js/plugin.xml Wed Jun  1 15:20:01 2005
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-js"
+   name="JavaScript Parser"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <extension-point
+      id="org.apache.nutch.parse.HtmlParseFilter"
+      name="HTML Parse Filter"/>
+
+   <runtime>
+      <library name="parse-js.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.parse.js.JSParseFilter"
+              name="Parse JS Filter"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="JSParseFilter"
+	      class="org.apache.nutch.parse.js.JSParseFilter"
+	      contentType="application/x-javascript"
+	      pathSuffix=""/>
+   </extension>
+
+</plugin>

Propchange: incubator/nutch/trunk/src/plugin/parse-js/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html (added)
+++ incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html Wed Jun  1 15:20:01 2005
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>A parser plugin and content filter to extract all (possible) links
+from JavaScript files and code snippets.</p>
+</body>
+</html>

Propchange: incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (added)
+++ incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Wed Jun  1 15:20:01 2005
@@ -0,0 +1,226 @@
+package org.apache.nutch.parse.js;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Properties;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogFormatter;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * This class is a heuristic link extractor for JavaScript files and
+ * code snippets. The general idea of a two-pass regex matching comes from
+ * Heritrix. Parts of the code come from OutlinkExtractor.java
+ * by Stephan Strittmatter.
+ *
+ * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ */
+public class JSParseFilter implements HtmlParseFilter, Parser {
+  public static final Logger LOG =
+    LogFormatter.getLogger("org.apache.nutch.parse.js.JSParseFilter");
+
+  private static final int MAX_TITLE_LEN = 80;
+  
+  /**
+   * Adds the links found in the scripts and "on*" attributes of
+   * <code>doc</code> to the outlinks of <code>parse</code>.
+   *
+   * @return <code>parse</code> itself when no link was found; otherwise a new
+   *         Parse with the same text, status, title and metadata, whose
+   *         outlinks are the JavaScript links followed by the original ones.
+   */
+  public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
+    String url = content.getBaseUrl();
+    ArrayList outlinks = new ArrayList();
+    walk(doc, parse, metaTags, url, outlinks);
+    if (outlinks.isEmpty()) return parse;
+    ParseData data = parse.getData();
+    outlinks.addAll(Arrays.asList(data.getOutlinks()));
+    Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
+    return new ParseImpl(parse.getText(),
+            new ParseData(data.getStatus(), data.getTitle(), newlinks, data.getMetadata()));
+  }
+  
+  /**
+   * Recursively collects into <code>outlinks</code> the links found in the
+   * text of "script" elements and in the "on*" attributes of other elements.
+   * The "language" attribute of a script is not consulted.
+   */
+  private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List outlinks) {
+    if (n instanceof Element) {
+      if (n.getNodeName().equalsIgnoreCase("script")) {
+        NodeList nn = n.getChildNodes();
+        if (nn.getLength() > 0) {
+          StringBuffer script = new StringBuffer();
+          for (int i = 0; i < nn.getLength(); i++) {
+            if (i > 0) script.append('\n');
+            script.append(nn.item(i).getNodeValue());
+          }
+          outlinks.addAll(Arrays.asList(getJSLinks(script.toString(), base, base)));
+          // no other children of interest here, go one level up.
+          return;
+        }
+      } else {
+        // process all HTML 4.0 events, if present...
+        // Window: onload,onunload
+        // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
+        // Keyboard: onkeydown,onkeypress,onkeyup
+        // Mouse: onclick,ondblclick,onmousedown,onmouseout,onmouseover,onmouseup
+        NamedNodeMap attrs = n.getAttributes();
+        int len = attrs.getLength();
+        for (int i = 0; i < len; i++) {
+          Node anode = attrs.item(i);
+          if (anode.getNodeName().startsWith("on")) {
+            outlinks.addAll(Arrays.asList(getJSLinks(anode.getNodeValue(), base, base)));
+          }
+        }
+      }
+    }
+    NodeList nl = n.getChildNodes();
+    for (int i = 0; i < nl.getLength(); i++) {
+      walk(nl.item(i), parse, metaTags, base, outlinks);
+    }
+  }
+  
+  /**
+   * Parses a standalone JavaScript file: the outlinks are the links extracted
+   * from its string literals, the text is the script itself and the title is
+   * its first line, cut at MAX_TITLE_LEN characters.
+   *
+   * @return the parse, or an empty parse with a FAILED / FAILED_INVALID_FORMAT
+   *         status if the content type is set and is not JavaScript.
+   */
+  public Parse getParse(Content c) {
+    String type = c.getContentType();
+    // major code FAILED, minor code FAILED_INVALID_FORMAT, the same way the
+    // other parsers report an unsupported content type
+    if (type != null && !type.toLowerCase().startsWith("application/x-javascript"))
+      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+              "Content not JavaScript: " + type).getEmptyParse();
+    // NOTE(review): decodes the bytes with the platform default charset - confirm
+    String script = new String(c.getContent());
+    Outlink[] outlinks = getJSLinks(script, c.getUrl(), c.getUrl());
+    // Title? use the first line of the script...
+    int idx = script.indexOf('\n');
+    if (idx == -1) idx = script.length();
+    String title = script.substring(0, Math.min(idx, MAX_TITLE_LEN));
+    Properties metadata = new Properties();
+    metadata.putAll(c.getMetadata());
+    ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+            outlinks, metadata);
+    return new ParseImpl(script, pd);
+  }
+  
+  // Quoted string literal without whitespace; group 2 is its content.
+  private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
+  // A simple pattern. This allows also invalid URL characters.
+  private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";
+  // Alternative pattern, which limits valid url characters.
+  //private static final String URI_PATTERN = "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
+  
+  /**
+   *  This method extracts URLs from literals embedded in JavaScript.
+   *
+   * @param plainText JavaScript source to scan
+   * @param anchor anchor text given to every extracted Outlink
+   * @param base URL against which relative links are resolved
+   * @return the extracted links; never null, empty if none was found
+   */
+  private static Outlink[] getJSLinks(String plainText, String anchor, String base) {
+    final List outlinks = new ArrayList();
+    URL baseURL = null;
+    
+    try {
+      baseURL = new URL(base);
+    } catch (Exception e) {
+      LOG.throwing(JSParseFilter.class.getName(), "getJSLinks", e);
+    }
+
+    try {
+      final PatternCompiler cp = new Perl5Compiler();
+      final Pattern pattern = cp.compile(STRING_PATTERN,
+          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+              | Perl5Compiler.MULTILINE_MASK);
+      final Pattern pattern1 = cp.compile(URI_PATTERN,
+              Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+                  | Perl5Compiler.MULTILINE_MASK);
+      final PatternMatcher matcher = new Perl5Matcher();
+      final PatternMatcher matcher1 = new Perl5Matcher();
+      final PatternMatcherInput input = new PatternMatcherInput(plainText);
+      //loop the matches
+      while (matcher.contains(input, pattern)) {
+        String url = matcher.getMatch().group(2);
+        if (!matcher1.matches(new PatternMatcherInput(url), pattern1)) {
+          //LOG.fine(" - invalid '" + url + "'");
+          continue;
+        }
+        try {
+          if (url.startsWith("www.")) {
+            url = "http://" + url;
+          } else url = new URL(baseURL, url).toString();
+          url = url.replaceAll("&amp;", "&");
+          LOG.fine(" - outlink from JS: '" + url + "'");
+          outlinks.add(new Outlink(url, anchor));
+        } catch (Exception ex) {
+          // if it is a malformed URL we just throw it away and continue with
+          // extraction.
+          LOG.throwing(JSParseFilter.class.getName(), "getJSLinks", ex);
+        }
+      }
+    } catch (Exception ex) {
+      // the patterns could not be compiled or applied: keep what was found
+      LOG.throwing(JSParseFilter.class.getName(), "getJSLinks", ex);
+    }
+
+    return (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]);
+  }
+  
+  public static void main(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
+      return;
+    }
+    InputStream in = new FileInputStream(args[0]);
+    StringBuffer sb = new StringBuffer();
+    try {
+      BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
+      String line = null;
+      while ((line = br.readLine()) != null) sb.append(line).append('\n');
+    } finally {
+      in.close();  // release the file even if reading fails
+    }
+    Outlink[] links = getJSLinks(sb.toString(), args[1], args[1]);
+    System.out.println("Outlinks extracted: " + links.length);
+    for (int i = 0; i < links.length; i++)
+      System.out.println(" - " + links[i]);
+  }
+}

Propchange: incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Wed Jun  1 15:20:01 2005
@@ -18,6 +18,7 @@
 
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
@@ -52,13 +53,13 @@
 
   public MSWordParser () {}
 
-  public Parse getParse(Content content) throws ParseException {
+  public Parse getParse(Content content) {
 
     // check that contentType is one we can handle
     String contentType = content.getContentType();
     if (contentType != null && !contentType.startsWith("application/msword"))
-      throw new ParseException(
-        "Content-Type not application/msword: "+contentType);
+      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+        "Content-Type not application/msword: " + contentType).getEmptyParse();
 
     String text = null;
     String title = null;
@@ -71,8 +72,9 @@
       String contentLength = content.get("Content-Length");
       if (contentLength != null
             && raw.length != Integer.parseInt(contentLength)) {
-          throw new ParseException("Content truncated at "+raw.length
-            +" bytes. Parser can't handle incomplete msword file.");
+          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+                  "Content truncated at " + raw.length
+            +" bytes. Parser can't handle incomplete msword file.").getEmptyParse();
       }
 
       WordExtractor extractor = new WordExtractor();
@@ -86,13 +88,14 @@
       extractor = null;
 
     } catch (ParseException e) {
-      throw e;
+      return new ParseStatus(e).getEmptyParse();
     } catch (FastSavedException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
     } catch (PasswordProtectedException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
     } catch (Exception e) { // run time exception
-      throw new ParseException("Can't be handled as msword document. "+e);
+      return new ParseStatus(ParseStatus.FAILED,
+              "Can't be handled as msword document. " + e).getEmptyParse();
     } finally {
       // nothing so far
     }
@@ -116,7 +119,7 @@
     // collect outlink
     Outlink[] outlinks = new Outlink[0];
 
-    ParseData parseData = new ParseData(title, outlinks, metadata);
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
     return new ParseImpl(text, parseData);
     // any filter?
     //return HtmlParseFilters.filter(content, parse, root);

Modified: incubator/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Wed Jun  1 15:20:01 2005
@@ -64,7 +64,7 @@
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
       protocol = ProtocolFactory.getProtocol(urlString);
-      content = protocol.getContent(urlString);
+      content = protocol.getProtocolOutput(urlString).getContent();
 
       parser = ParserFactory.getParser(content.getContentType(), urlString);
       parse = parser.getParse(content);

Modified: incubator/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Wed Jun  1 15:20:01 2005
@@ -27,6 +27,7 @@
 
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
@@ -79,13 +80,13 @@
     rootLogger.addAppender(appender);
   }
 
-  public Parse getParse(Content content) throws ParseException {
+  public Parse getParse(Content content) {
 
     // check that contentType is one we can handle
     String contentType = content.getContentType();
     if (contentType != null && !contentType.startsWith("application/pdf"))
-      throw new ParseException(
-        "Content-Type not application/pdf: "+contentType);
+      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+        "Content-Type not application/pdf: " + contentType).getEmptyParse();
 
     // in memory representation of pdf file
     PDDocument pdf = null;
@@ -100,8 +101,9 @@
       String contentLength = content.get("Content-Length");
       if (contentLength != null
             && raw.length != Integer.parseInt(contentLength)) {
-          throw new ParseException("Content truncated at "+raw.length
-            +" bytes. Parser can't handle incomplete pdf file.");
+          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+                  "Content truncated at "+raw.length
+            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
       }
 
       PDFParser parser = new PDFParser(
@@ -134,14 +136,15 @@
       // formatDate(info.getCreationDate())
       // formatDate(info.getModificationDate())
 
-    } catch (ParseException e) {
-      throw e;
     } catch (CryptographyException e) {
-      throw new ParseException("Error decrypting document. "+e);
+      return new ParseStatus(ParseStatus.FAILED,
+              "Error decrypting document. " + e).getEmptyParse();
     } catch (InvalidPasswordException e) {
-      throw new ParseException("Can't decrypt document. "+e);
+      return new ParseStatus(ParseStatus.FAILED,
+              "Can't decrypt document - invalid password. " + e).getEmptyParse();
     } catch (Exception e) { // run time exception
-      throw new ParseException("Can't be handled as pdf document. "+e);
+      return new ParseStatus(ParseStatus.FAILED,
+              "Can't be handled as pdf document. " + e).getEmptyParse();
     } finally {
       try {
         if (pdf != null)
@@ -164,7 +167,7 @@
     Properties metadata = new Properties();
     metadata.putAll(content.getMetadata()); // copy through
 
-    ParseData parseData = new ParseData(title, outlinks, metadata);
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
     return new ParseImpl(text, parseData);
     // any filter?
     //return HtmlParseFilters.filter(content, parse, root);

Modified: incubator/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java Wed Jun  1 15:20:01 2005
@@ -64,7 +64,7 @@
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
       protocol = ProtocolFactory.getProtocol(urlString);
-      content = protocol.getContent(urlString);
+      content = protocol.getProtocolOutput(urlString).getContent();
 
       parser = ParserFactory.getParser(content.getContentType(), urlString);
       parse = parser.getParse(content);

Modified: incubator/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Wed Jun  1 15:20:01 2005
@@ -23,12 +23,12 @@
 import org.apache.nutch.util.*;
 
 public class TextParser implements Parser {
-  public Parse getParse(Content content) throws ParseException {
+  public Parse getParse(Content content) {
     // copy content meta data through
     Properties metadata = new Properties();
     metadata.putAll(content.getMetadata());
 
-    ParseData parseData = new ParseData("", new Outlink[0], metadata);
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata);
 
     String encoding =
       StringUtil.parseCharacterEncoding(content.getContentType());
@@ -37,7 +37,7 @@
       try {                                       // try to use named encoding
         text = new String(content.getContent(), encoding);
       } catch (java.io.UnsupportedEncodingException e) {
-        throw new ParseException(e);
+        return new ParseStatus(e).getEmptyParse();
       }
     } else {
       // FIXME: implement charset detector. This code causes problem when 

Modified: incubator/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original)
+++ incubator/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Wed Jun  1 15:20:01 2005
@@ -17,24 +17,24 @@
 package org.apache.nutch.protocol.file;
 
 
+import org.apache.nutch.db.Page;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 
+import org.apache.nutch.pagedb.FetchListEntry;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
 
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
+import java.net.MalformedURLException;
 import java.net.URL;
 
-import java.io.InputStream;
-// 20040528, xing, disabled for now
-//import java.io.Reader;
-import java.io.IOException;
-
 /************************************
  * File.java deals with file: scheme.
  *
@@ -65,9 +65,20 @@
   }
 
   /** Set the point at which content is truncated. */
-  public void setMaxContentLength(int length) {this.maxContentLength = length;}
+  public void setMaxContentLength(int length) {maxContentLength = length;}
 
-  public Content getContent(String urlString) throws FileException {
+  public ProtocolOutput getProtocolOutput(String urlString) { // overload taking a plain URL string
+    ProtocolOutput output = null; // NOTE(review): never read or reassigned in this method
+    try { // wrap the URL in a FetchListEntry and delegate to the FetchListEntry overload
+      return getProtocolOutput(new FetchListEntry(true,
+            new Page(urlString, 1.0f), new String[0]));
+    } catch (MalformedURLException mue) { // bad URL is reported via the status, not thrown
+      return new ProtocolOutput(null, new ProtocolStatus(mue)); // null content, status built from mue
+    }
+  }
+  
+  public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
+    String urlString = fle.getUrl().toString();
     try {
       URL url = new URL(urlString);
   
@@ -80,7 +91,7 @@
         int code = response.getCode();
   
         if (code == 200) {                          // got a good response
-          return response.toContent();              // return it
+          return new ProtocolOutput(response.toContent());              // return it
   
         } else if (code >= 300 && code < 400) {     // handle redirect
           if (redirects == MAX_REDIRECTS)
@@ -94,8 +105,8 @@
           throw new FileError(code);
         }
       } 
-    } catch (IOException e) {
-      throw new FileException(e);
+    } catch (Exception e) {
+      return new ProtocolOutput(null, new ProtocolStatus(e));
     }
   }
 
@@ -139,7 +150,7 @@
     // set log level
     LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
 
-    Content content = file.getContent(urlString);
+    Content content = file.getProtocolOutput(urlString).getContent();
 
     System.err.println("Content-Type: " + content.getContentType());
     System.err.println("Content-Length: " + content.get("Content-Length"));

Modified: incubator/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original)
+++ incubator/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Wed Jun  1 15:20:01 2005
@@ -19,22 +19,24 @@
 
 import org.apache.commons.net.ftp.FTPFileEntryParser;
 
+import org.apache.nutch.db.Page;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 
+import org.apache.nutch.pagedb.FetchListEntry;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
 
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
+import java.net.MalformedURLException;
 import java.net.URL;
 
-import java.io.InputStream;
-// 20040528, xing, disabled for now
-//import java.io.Reader;
 import java.io.IOException;
 
 /************************************
@@ -91,13 +93,13 @@
   }
 
   /** Set the timeout. */
-  public void setTimeout(int timeout) {
-    this.timeout = timeout;
+  public void setTimeout(int to) {
+    timeout = to;
   }
 
   /** Set the point at which content is truncated. */
   public void setMaxContentLength(int length) {
-    this.maxContentLength = length;
+    maxContentLength = length;
   }
 
   /** Set followTalk */
@@ -110,7 +112,18 @@
     this.keepConnection = keepConnection;
   }
 
-  public Content getContent(String urlString) throws FtpException {
+  public ProtocolOutput getProtocolOutput(String urlString) { // overload taking a plain URL string
+    ProtocolOutput output = null; // NOTE(review): never read or reassigned in this method
+    try { // wrap the URL in a FetchListEntry and delegate to the FetchListEntry overload
+      return getProtocolOutput(new FetchListEntry(true,
+            new Page(urlString, 1.0f), new String[0]));
+    } catch (MalformedURLException mue) { // bad URL is reported via the status, not thrown
+      return new ProtocolOutput(null, new ProtocolStatus(mue)); // null content, status built from mue
+    }
+  }
+  
+  public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
+    String urlString = fle.getUrl().toString();
     try {
       URL url = new URL(urlString);
   
@@ -123,7 +136,7 @@
         int code = response.getCode();
   
         if (code == 200) {                          // got a good response
-          return response.toContent();              // return it
+          return new ProtocolOutput(response.toContent());              // return it
   
         } else if (code >= 300 && code < 400) {     // handle redirect
           if (redirects == MAX_REDIRECTS)
@@ -137,8 +150,8 @@
           throw new FtpError(code);
         }
       } 
-    } catch (IOException e) {
-      throw new FtpException(e);
+    } catch (Exception e) {
+      return new ProtocolOutput(null, new ProtocolStatus(e));
     }
   }
 
@@ -205,7 +218,7 @@
     // set log level
     LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
 
-    Content content = ftp.getContent(urlString);
+    Content content = ftp.getProtocolOutput(urlString).getContent();
 
     System.err.println("Content-Type: " + content.getContentType());
     System.err.println("Content-Length: " + content.get("Content-Length"));

Modified: incubator/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java (original)
+++ incubator/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java Wed Jun  1 15:20:01 2005
@@ -16,7 +16,7 @@
 
 package org.apache.nutch.protocol.http;
 
-import java.io.*;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
@@ -28,6 +28,8 @@
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 
+import org.apache.nutch.db.Page;
+import org.apache.nutch.pagedb.FetchListEntry;
 import org.apache.nutch.protocol.*;
 
 /** An implementation of the Http protocol. */
@@ -170,7 +172,18 @@
     }
   }
 
-  public Content getContent(String urlString) throws ProtocolException {
+  public ProtocolOutput getProtocolOutput(String urlString) { // overload taking a plain URL string
+    ProtocolOutput output = null; // NOTE(review): never read or reassigned in this method
+    try { // wrap the URL in a FetchListEntry and delegate to the FetchListEntry overload
+      return getProtocolOutput(new FetchListEntry(true,
+            new Page(urlString, 1.0f), new String[0]));
+    } catch (MalformedURLException mue) { // bad URL is reported via the status, not thrown
+      return new ProtocolOutput(null, new ProtocolStatus(mue)); // null content, status built from mue
+    }
+  }
+  
+  public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
+    String urlString = fle.getUrl().toString();
     try {
       URL url = new URL(urlString);
 
@@ -191,7 +204,7 @@
         int code = response.getCode();
         
         if (code == 200) {                        // got a good response
-          return response.toContent();            // return it
+          return new ProtocolOutput(response.toContent());            // return it
           
         } else if (code == 410) {                 // page is gone
           throw new ResourceGone(url, "Http: " + code);
@@ -207,8 +220,8 @@
           throw new HttpError(code);
         }
       }
-    } catch (IOException e) {
-      throw new HttpException(e);
+    } catch (Exception e) {
+      return new ProtocolOutput(null, new ProtocolStatus(e));
     } 
   }
 
@@ -285,7 +298,7 @@
       LOG.setLevel(Level.FINE);
     }
 
-    Content content = http.getContent(url);
+    Content content = http.getProtocolOutput(url).getContent();
 
     System.out.println("Content Type: " + content.getContentType());
     System.out.println("Content Length: " + content.get("Content-Length"));

Added: incubator/nutch/trunk/src/plugin/protocol-httpclient/build.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-httpclient/build.xml?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/protocol-httpclient/build.xml (added)
+++ incubator/nutch/trunk/src/plugin/protocol-httpclient/build.xml Wed Jun  1 15:20:01 2005
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="protocol-httpclient" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Propchange: incubator/nutch/trunk/src/plugin/protocol-httpclient/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-codec.jar
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-codec.jar?rev=179436&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-codec.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0-rc2.jar
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0-rc2.jar?rev=179436&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0-rc2.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml (added)
+++ incubator/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml Wed Jun  1 15:20:01 2005
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="protocol-httpclient"
+   name="Http / Https Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <extension-point
+      id="org.apache.nutch.protocol.Protocol"
+      name="Nutch Protocol"/>
+
+   <runtime>
+      <library name="protocol-httpclient.jar">
+         <export name="*"/>
+      </library>
+      <library name="commons-codec.jar" />
+      <library name="commons-httpclient-3.0-rc2.jar" />
+      
+   </runtime>
+
+   <extension id="org.apache.nutch.protocol.httpclient"
+	   name="HttpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.httpclient.Http"
+                      class="org.apache.nutch.protocol.httpclient.Http"
+                      protocolName="http"/>
+
+   </extension>
+
+   <extension id="org.apache.nutch.protocol.https"
+              name="HttpsProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.httpclient.Http"
+                      class="org.apache.nutch.protocol.httpclient.Http"
+                      protocolName="https"/>
+
+   </extension>
+
+</plugin>

Propchange: incubator/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java (added)
+++ incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java Wed Jun  1 15:20:01 2005
@@ -0,0 +1,129 @@
+/*
+ * Based on EasySSLProtocolSocketFactory from commons-httpclient:
+ * 
+ * $Header:
+ * /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/contrib/org/apache/commons/httpclient/contrib/ssl/DummySSLProtocolSocketFactory.java,v
+ * 1.7 2004/06/11 19:26:27 olegk Exp $ $Revision$ $Date: 2005-02-26 05:01:52
+ * -0800 (Sat, 26 Feb 2005) $
+ */
+
+package org.apache.nutch.protocol.httpclient;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.Socket;
+import java.net.UnknownHostException;
+
+import org.apache.commons.httpclient.ConnectTimeoutException;
+import org.apache.commons.httpclient.HttpClientError;
+import org.apache.commons.httpclient.params.HttpConnectionParams;
+import org.apache.commons.httpclient.protocol.ControllerThreadSocketFactory;
+import org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import com.sun.net.ssl.SSLContext;
+import com.sun.net.ssl.TrustManager;
+
+public class DummySSLProtocolSocketFactory implements SecureProtocolSocketFactory {
+
+  /** Log object for this class. */
+  private static final Log LOG = LogFactory.getLog(DummySSLProtocolSocketFactory.class);
+
+  private SSLContext sslcontext = null; // built lazily by getSSLContext()
+
+  /**
+   * Creates the factory; the SSL context is not built until a socket is requested.
+   */
+  public DummySSLProtocolSocketFactory() {
+    super();
+  }
+
+  private static SSLContext createEasySSLContext() {
+    try { // "SSL" context whose sole trust manager is a DummyX509TrustManager
+      SSLContext context = SSLContext.getInstance("SSL");
+      context.init(null, new TrustManager[] { new DummyX509TrustManager(null) }, null);
+      return context;
+    } catch (Exception e) { // any failure is logged, then rethrown as HttpClientError
+      LOG.error(e.getMessage(), e);
+      throw new HttpClientError(e.toString()); // NOTE(review): only e.toString() is kept; cause is not chained
+    }
+  }
+
+  private SSLContext getSSLContext() {
+    if (this.sslcontext == null) { // NOTE(review): unsynchronized lazy init - confirm threading assumptions
+      this.sslcontext = createEasySSLContext();
+    }
+    return this.sslcontext;
+  }
+
+  /**
+   * @see SecureProtocolSocketFactory#createSocket(java.lang.String,int,java.net.InetAddress,int)
+   */
+  public Socket createSocket(String host, int port, InetAddress clientHost, int clientPort) throws IOException,
+          UnknownHostException {
+
+    return getSSLContext().getSocketFactory().createSocket(host, port, clientHost, clientPort);
+  }
+
+  /**
+   * Attempts to get a new socket connection to the given host within the given
+   * time limit.
+   * <p>
+   * To circumvent the limitations of older JREs that do not support connect
+   * timeout a controller thread is executed. The controller thread attempts to
+   * create a new socket within the given limit of time. If the socket constructor
+   * does not return before the timeout expires, the controller terminates and
+   * throws a {@link ConnectTimeoutException}.
+   * </p>
+   * 
+   * @param host the host name/IP
+   * @param port the port on the host
+   * @param localAddress the local host name/IP to bind the socket to
+   * @param localPort the port on the local machine
+   * @param params {@link HttpConnectionParams Http connection parameters}; must not be null
+   * 
+   * @return Socket a new socket
+   * @throws IllegalArgumentException if <code>params</code> is null
+   * @throws IOException if an I/O error occurs while creating the socket
+   * @throws UnknownHostException if the host's IP address cannot be determined
+   * @throws ConnectTimeoutException if no socket is created within the timeout
+   */
+  public Socket createSocket(final String host, final int port, final InetAddress localAddress, final int localPort,
+          final HttpConnectionParams params) throws IOException, UnknownHostException, ConnectTimeoutException {
+    if (params == null) {
+      throw new IllegalArgumentException("Parameters may not be null");
+    }
+    int timeout = params.getConnectionTimeout();
+    if (timeout == 0) { // zero timeout: connect directly, no controller thread
+      return createSocket(host, port, localAddress, localPort);
+    } else {
+      // To be eventually deprecated when migrated to Java 1.4 or above
+      return ControllerThreadSocketFactory.createSocket(this, host, port, localAddress, localPort, timeout);
+    }
+  }
+
+  /**
+   * @see SecureProtocolSocketFactory#createSocket(java.lang.String,int)
+   */
+  public Socket createSocket(String host, int port) throws IOException, UnknownHostException {
+    return getSSLContext().getSocketFactory().createSocket(host, port);
+  }
+
+  /**
+   * @see SecureProtocolSocketFactory#createSocket(java.net.Socket,java.lang.String,int,boolean)
+   */
+  public Socket createSocket(Socket socket, String host, int port, boolean autoClose) throws IOException,
+          UnknownHostException {
+    return getSSLContext().getSocketFactory().createSocket(socket, host, port, autoClose);
+  }
+
+  public boolean equals(Object obj) { // true for any non-null object of exactly this class
+    return ((obj != null) && obj.getClass().equals(DummySSLProtocolSocketFactory.class));
+  }
+
+  public int hashCode() { // same value for every instance, matching equals
+    return DummySSLProtocolSocketFactory.class.hashCode();
+  }
+
+}

Propchange: incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native