You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2005/06/02 00:20:05 UTC
svn commit: r179436 [2/3] - in /incubator/nutch/trunk: ./ conf/
src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/
src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/protocol/
src/java/org/apache/nutch/tools/ src/plugin/
src/plugin/creativecommons/src/java/org/creativecommons/nutch/
src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/
src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/
src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/
src/plugin/parse-html/ src/plugin/parse-html/lib/
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/
src/plugin/parse-html/src/test/org/apache/nutch/parse/html/
src/plugin/parse-js/ src/plugin/parse-js/src/ src/plugin/parse-js/src/java/
src/plugin/parse-js/src/java/org/ src/plugin/parse-js/src/java/org/apache/
src/plugin/parse-js/src/java/org/apache/nutch/
src/plugin/parse-js/src/java/org/apache/nutch/parse/
src/plugin/parse-js/src/java/org/apache/nutch/parse/js/
src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/
src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/
src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/
src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/
src/plugin/parse-text/src/java/org/apache/nutch/parse/text/
src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/
src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/
src/plugin/protocol-httpclient/ src/plugin/protocol-httpclient/lib/
src/plugin/protocol-httpclient/src/ src/plugin/protocol-httpclient/src/java/
src/plugin/protocol-httpclient/src/java/org/
src/plugin/protocol-httpclient/src/java/org/apache/
src/plugin/protocol-httpclient/src/java/org/apache/nutch/
src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/
src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/
src/test/org/apache/nutch/fetcher/ src/test/org/apache/nutch/parse/
src/test/org/apache/nutch/tools/
Added: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java (added)
+++ incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java Wed Jun 1 15:20:01 2005
@@ -0,0 +1,738 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
+ * avoid dependency on Xalan.
+ */
+
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id: DOMBuilder.java,v 1.19 2004/02/25 13:07:51 aruny Exp $
+ */
+package org.apache.nutch.parse.html;
+
+import java.util.Stack;
+
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.Text;
+import org.w3c.dom.CDATASection;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.ext.LexicalHandler;
+/**
+ * This class takes SAX events (in addition to some extra events
+ * that SAX doesn't handle yet) and adds the result to a document
+ * or document fragment.
+ * @xsl.usage general
+ */
+public class DOMBuilder
+ implements ContentHandler, LexicalHandler
+{
+
+ /** Root document */
+ public Document m_doc;
+
+ /** Current node */
+ protected Node m_currentNode = null;
+
+ /** First node of document fragment or null if not a DocumentFragment */
+ public DocumentFragment m_docFrag = null;
+
+ /** Stack of the element nodes that are currently open */
+ protected Stack m_elemStack = new Stack();
+
+ /**
+ * DOMBuilder instance constructor... it will add the DOM nodes
+ * as children of the given node.
+ * @param doc Root document
+ * @param node Current node; new nodes are appended below it
+ */
+ public DOMBuilder(Document doc, Node node)
+ {
+   m_doc = doc;
+   m_currentNode = node;
+   if (node instanceof Element) m_elemStack.push(node); // so endElement() pops back to it, not to null
+ }
+
+ /**
+ * DOMBuilder instance constructor... it will add the DOM nodes
+ * to the document fragment.
+ *
+ * @param doc Root document
+ * @param docFrag Document fragment
+ */
+ public DOMBuilder(Document doc, DocumentFragment docFrag)
+ {
+ m_doc = doc;
+ m_docFrag = docFrag;
+ }
+
+ /**
+ * DOMBuilder instance constructor... it will add the DOM nodes
+ * to the document.
+ *
+ * @param doc Root document
+ */
+ public DOMBuilder(Document doc)
+ {
+ m_doc = doc;
+ }
+
+ /**
+ * Get the root node of the DOM being created. This
+ * is either a Document or a DocumentFragment.
+ *
+ * @return The root document or document fragment if not null
+ */
+ public Node getRootNode()
+ {
+   if (m_docFrag == null) return m_doc; else return m_docFrag; // a fragment, when present, is the root
+ }
+
+ /**
+ * Get the node currently being processed.
+ *
+ * @return the current node being processed
+ */
+ public Node getCurrentNode()
+ {
+ return m_currentNode;
+ }
+
+ /**
+ * Return null since there is no Writer for this class.
+ *
+ * @return null
+ */
+ public java.io.Writer getWriter()
+ {
+ return null;
+ }
+
+ /**
+ * Append a node to the current container.
+ *
+ * @param newNode New node to append
+ */
+ protected void append(Node newNode) throws org.xml.sax.SAXException
+ {
+
+ Node currentNode = m_currentNode;
+
+ if (null != currentNode)
+ {
+ currentNode.appendChild(newNode);
+
+ // System.out.println(newNode.getNodeName());
+ }
+ else if (null != m_docFrag)
+ {
+ m_docFrag.appendChild(newNode);
+ }
+ else
+ {
+ boolean ok = true;
+ short type = newNode.getNodeType();
+
+ if (type == Node.TEXT_NODE)
+ {
+ String data = newNode.getNodeValue();
+
+ if ((null != data) && (data.trim().length() > 0))
+ {
+ throw new org.xml.sax.SAXException("Warning: can't output text before document element! Ignoring...");
+ }
+
+ ok = false;
+ }
+ else if (type == Node.ELEMENT_NODE)
+ {
+ if (m_doc.getDocumentElement() != null)
+ {
+ throw new org.xml.sax.SAXException("Can't have more than one root on a DOM!");
+ }
+ }
+
+ if (ok)
+ m_doc.appendChild(newNode);
+ }
+ }
+
+ /**
+ * Receive an object for locating the origin of SAX document events.
+ *
+ * <p>SAX parsers are strongly encouraged (though not absolutely
+ * required) to supply a locator: if it does so, it must supply
+ * the locator to the application by invoking this method before
+ * invoking any of the other methods in the ContentHandler
+ * interface.</p>
+ *
+ * <p>The locator allows the application to determine the end
+ * position of any document-related event, even if the parser is
+ * not reporting an error. Typically, the application will
+ * use this information for reporting its own errors (such as
+ * character content that does not match an application's
+ * business rules). The information returned by the locator
+ * is probably not sufficient for use with a search engine.</p>
+ *
+ * <p>Note that the locator will return correct information only
+ * during the invocation of the events in this interface. The
+ * application should not attempt to use it at any other time.</p>
+ *
+ * @param locator An object that can return the location of
+ * any SAX document event.
+ * @see org.xml.sax.Locator
+ */
+ public void setDocumentLocator(Locator locator)
+ {
+
+ // No action for the moment.
+ }
+
+ /**
+ * Receive notification of the beginning of a document.
+ *
+ * <p>The SAX parser will invoke this method only once, before any
+ * other methods in this interface or in DTDHandler (except for
+ * setDocumentLocator).</p>
+ */
+ public void startDocument() throws org.xml.sax.SAXException
+ {
+
+ // No action for the moment.
+ }
+
+ /**
+ * Receive notification of the end of a document.
+ *
+ * <p>The SAX parser will invoke this method only once, and it will
+ * be the last method invoked during the parse. The parser shall
+ * not invoke this method until it has either abandoned parsing
+ * (because of an unrecoverable error) or reached the end of
+ * input.</p>
+ */
+ public void endDocument() throws org.xml.sax.SAXException
+ {
+
+ // No action for the moment.
+ }
+
+ /**
+ * Receive notification of the beginning of an element.
+ *
+ * <p>The Parser will invoke this method at the beginning of every
+ * element in the XML document; there will be a corresponding
+ * endElement() event for every startElement() event (even when the
+ * element is empty). All of the element's content will be
+ * reported, in order, before the corresponding endElement()
+ * event.</p>
+ *
+ * <p>If the element name has a namespace prefix, the prefix will
+ * still be attached. Note that the attribute list provided will
+ * contain only attributes with explicit values (specified or
+ * defaulted): #IMPLIED attributes will be omitted.</p>
+ *
+ *
+ * @param ns The namespace of the node
+ * @param localName The local part of the qualified name
+ * @param name The element name.
+ * @param atts The attributes attached to the element, if any.
+ * @see #endElement
+ * @see org.xml.sax.Attributes
+ */
+ public void startElement(
+     String ns, String localName, String name, Attributes atts)
+     throws org.xml.sax.SAXException
+ {
+   // Only ns and the qualified name are used to create the element;
+   // localName is ignored.
+
+   Element elem;
+
+   // Note that the namespace-aware call must be used to correctly
+   // construct a Level 2 DOM, even for non-namespaced nodes.
+   if ((null == ns) || (ns.length() == 0))
+     elem = m_doc.createElementNS(null,name);
+   else
+     elem = m_doc.createElementNS(ns, name);
+
+   // The element is attached to its parent before its attributes are copied.
+   append(elem);
+
+   try
+   {
+     int nAtts = atts.getLength();
+
+     // Copy the SAX attributes onto the element. The loop body simply does
+     // not run when there are none.
+     for (int i = 0; i < nAtts; i++)
+     {
+
+       // First handle a possible ID attribute
+       if (atts.getType(i).equalsIgnoreCase("ID"))
+         setIDAttribute(atts.getValue(i), elem);
+
+       String attrNS = atts.getURI(i);
+
+       if("".equals(attrNS))
+         attrNS = null; // DOM represents no-namespace as null
+
+       String attrQName = atts.getQName(i);
+
+       // In SAX, namespace declarations have an empty namespace, while in
+       // DOM they must carry the xmlns namespace. That holds for prefixed
+       // declarations ("xmlns:foo") and for the default declaration (plain
+       // "xmlns") alike: the DOM setAttributeNS contract raises NAMESPACE_ERR
+       // when the name "xmlns" is paired with any other namespace URI.
+       if (attrQName.startsWith("xmlns:") || attrQName.equals("xmlns"))
+         attrNS = "http://www.w3.org/2000/xmlns/";
+
+       // ALWAYS use the DOM Level 2 call!
+       elem.setAttributeNS(attrNS,attrQName, atts.getValue(i));
+     }
+
+     // From here on the new element is the container for all content
+     // reported until the matching endElement().
+     m_elemStack.push(elem);
+
+     m_currentNode = elem;
+   }
+   catch(java.lang.Exception de)
+   {
+     // Surface any failure through the SAX channel, keeping it as the cause.
+     throw new org.xml.sax.SAXException(de);
+   }
+
+ }
+
+ /**
+
+
+
+ * Receive notification of the end of an element.
+ *
+ * <p>The SAX parser will invoke this method at the end of every
+ * element in the XML document; there will be a corresponding
+ * startElement() event for every endElement() event (even when the
+ * element is empty).</p>
+ *
+ * <p>If the element name has a namespace prefix, the prefix will
+ * still be attached to the name.</p>
+ *
+ *
+ * @param ns the namespace of the element
+ * @param localName The local part of the qualified name of the element
+ * @param name The element name
+ */
+ public void endElement(String ns, String localName, String name)
+     throws org.xml.sax.SAXException
+ {
+   m_elemStack.pop(); // discard the element that is being closed
+   if (m_elemStack.isEmpty()) m_currentNode = null; else m_currentNode = (Node) m_elemStack.peek();
+ }
+
+ /**
+ * Set an ID string to node association in the ID table.
+ *
+ * @param id The ID string.
+ * @param elem The element associated with the ID.
+ */
+ public void setIDAttribute(String id, Element elem)
+ {
+
+ // Do nothing. This method is meant to be overridden.
+ }
+
+ /**
+ * Receive notification of character data.
+ *
+ * <p>The Parser will call this method to report each chunk of
+ * character data. SAX parsers may return all contiguous character
+ * data in a single chunk, or they may split it into several
+ * chunks; however, all of the characters in any single event
+ * must come from the same external entity, so that the Locator
+ * provides useful information.</p>
+ *
+ * <p>The application must not attempt to read from the array
+ * outside of the specified range.</p>
+ *
+ * <p>Note that some parsers will report whitespace using the
+ * ignorableWhitespace() method rather than this one (validating
+ * parsers must do so).</p>
+ *
+ * @param ch The characters from the XML document.
+ * @param start The start position in the array.
+ * @param length The number of characters to read from the array.
+ * @see #ignorableWhitespace
+ * @see org.xml.sax.Locator
+ */
+ public void characters(char ch[], int start, int length) throws org.xml.sax.SAXException
+ {
+   // Whitespace outside the document element is dropped rather than added
+   // at document level (avoids DOM006 Hierarchy request error).
+   boolean droppable = isOutsideDocElem()
+       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length);
+   if (droppable)
+     return;
+
+   // Inside a CDATA section the data is routed to cdata() instead.
+   if (m_inCData)
+   {
+     cdata(ch, start, length);
+     return;
+   }
+
+   String chunk = new String(ch, start, length);
+   Node previous = (m_currentNode == null) ? null : m_currentNode.getLastChild();
+   // Merge with a directly preceding text node instead of creating a sibling.
+   if (previous == null || previous.getNodeType() != Node.TEXT_NODE)
+     append(m_doc.createTextNode(chunk));
+   else
+     ((Text) previous).appendData(chunk);
+ }
+
+ /**
+ * If available, when the disable-output-escaping attribute is used,
+ * output raw text without escaping. A PI will be inserted in front
+ * of the node with the name "xslt-next-is-raw" and a value of
+ * "formatter-to-dom".
+ *
+ * @param ch Array containing the characters
+ * @param start Index to start of characters in the array
+ * @param length Number of characters in the array
+ */
+ public void charactersRaw(char ch[], int start, int length)
+     throws org.xml.sax.SAXException
+ {
+   if (!isOutsideDocElem()
+       || !XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+   {
+     String raw = new String(ch, start, length);
+     // Marker PI first, then the text it flags as raw.
+     Node marker =
+         m_doc.createProcessingInstruction("xslt-next-is-raw", "formatter-to-dom");
+     append(marker);
+     append(m_doc.createTextNode(raw));
+   }
+ }
+
+ /**
+ * Report the beginning of an entity.
+ *
+ * The start and end of the document entity are not reported.
+ * The start and end of the external DTD subset are reported
+ * using the pseudo-name "[dtd]". All other events must be
+ * properly nested within start/end entity events.
+ *
+ * @param name The name of the entity. If it is a parameter
+ * entity, the name will begin with '%'.
+ * @see #endEntity
+ * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
+ * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
+ */
+ public void startEntity(String name) throws org.xml.sax.SAXException
+ {
+
+ // Almost certainly the wrong behavior...
+ // entityReference(name);
+ }
+
+ /**
+ * Report the end of an entity.
+ *
+ * @param name The name of the entity that is ending.
+ * @see #startEntity
+ */
+ public void endEntity(String name) throws org.xml.sax.SAXException{}
+
+ /**
+ * Receive notification of an entity reference.
+ *
+ * @param name name of the entity reference
+ */
+ public void entityReference(String name) throws org.xml.sax.SAXException
+ {
+ append(m_doc.createEntityReference(name));
+ }
+
+ /**
+ * Receive notification of ignorable whitespace in element content.
+ *
+ * <p>Validating Parsers must use this method to report each chunk
+ * of ignorable whitespace (see the W3C XML 1.0 recommendation,
+ * section 2.10): non-validating parsers may also use this method
+ * if they are capable of parsing and using content models.</p>
+ *
+ * <p>SAX parsers may return all contiguous whitespace in a single
+ * chunk, or they may split it into several chunks; however, all of
+ * the characters in any single event must come from the same
+ * external entity, so that the Locator provides useful
+ * information.</p>
+ *
+ * <p>The application must not attempt to read from the array
+ * outside of the specified range.</p>
+ *
+ * @param ch The characters from the XML document.
+ * @param start The start position in the array.
+ * @param length The number of characters to read from the array.
+ * @see #characters
+ */
+ public void ignorableWhitespace(char ch[], int start, int length)
+     throws org.xml.sax.SAXException
+ {
+   // Whitespace is kept only inside the document element or a fragment;
+   // outside of them it is skipped (avoids DOM006 Hierarchy request error).
+   if (!isOutsideDocElem())
+   {
+     append(m_doc.createTextNode(new String(ch, start, length)));
+   }
+ }
+
+ /**
+ * Tell if the current node is outside the document element.
+ *
+ * @return true if the current node is outside the document element.
+ */
+ private boolean isOutsideDocElem()
+ {
+   return !(m_docFrag != null || !m_elemStack.isEmpty() || (m_currentNode != null && m_currentNode.getNodeType() != Node.DOCUMENT_NODE));
+ }
+
+ /**
+ * Receive notification of a processing instruction.
+ *
+ * <p>The Parser will invoke this method once for each processing
+ * instruction found: note that processing instructions may occur
+ * before or after the main document element.</p>
+ *
+ * <p>A SAX parser should never report an XML declaration (XML 1.0,
+ * section 2.8) or a text declaration (XML 1.0, section 4.3.1)
+ * using this method.</p>
+ *
+ * @param target The processing instruction target.
+ * @param data The processing instruction data, or null if
+ * none was supplied.
+ */
+ public void processingInstruction(String target, String data)
+ throws org.xml.sax.SAXException
+ {
+ append(m_doc.createProcessingInstruction(target, data));
+ }
+
+ /**
+ * Report an XML comment anywhere in the document.
+ *
+ * This callback will be used for comments inside or outside the
+ * document element, including comments in the external DTD
+ * subset (if read).
+ *
+ * @param ch An array holding the characters in the comment.
+ * @param start The starting position in the array.
+ * @param length The number of characters to use from the array.
+ */
+ public void comment(char ch[], int start, int length) throws org.xml.sax.SAXException
+ {
+ append(m_doc.createComment(new String(ch, start, length)));
+ }
+
+ /** Flag indicating that we are processing a CData section */
+ protected boolean m_inCData = false;
+
+ /**
+ * Report the start of a CDATA section.
+ *
+ * @see #endCDATA
+ */
+ public void startCDATA() throws org.xml.sax.SAXException
+ {
+ m_inCData = true;
+ append(m_doc.createCDATASection(""));
+ }
+
+ /**
+ * Report the end of a CDATA section.
+ *
+ * @see #startCDATA
+ */
+ public void endCDATA() throws org.xml.sax.SAXException
+ {
+ m_inCData = false;
+ }
+
+ /**
+ * Receive notification of cdata.
+ *
+ * <p>The Parser will call this method to report each chunk of
+ * character data. SAX parsers may return all contiguous character
+ * data in a single chunk, or they may split it into several
+ * chunks; however, all of the characters in any single event
+ * must come from the same external entity, so that the Locator
+ * provides useful information.</p>
+ *
+ * <p>The application must not attempt to read from the array
+ * outside of the specified range.</p>
+ *
+ * <p>Note that some parsers will report whitespace using the
+ * ignorableWhitespace() method rather than this one (validating
+ * parsers must do so).</p>
+ *
+ * @param ch The characters from the XML document.
+ * @param start The start position in the array.
+ * @param length The number of characters to read from the array.
+ * @see #ignorableWhitespace
+ * @see org.xml.sax.Locator
+ */
+ public void cdata(char ch[], int start, int length) throws org.xml.sax.SAXException
+ {
+   if(isOutsideDocElem()
+     && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+     return; // avoid DOM006 Hierarchy request error
+   String s = new String(ch, start, length);
+   // No element may be open yet (e.g. a builder created on a fragment); look in the root container then.
+   Node container = (null != m_currentNode) ? m_currentNode : getRootNode();
+   // XXX ab@apache.org: modified from the original, to accommodate TagSoup:
+   // data is appended only to a trailing CDATA section or comment, otherwise it is dropped.
+   Node n = (null != container) ? container.getLastChild() : null;
+   if (n instanceof CDATASection)
+     ((CDATASection)n).appendData(s);
+   else if (n instanceof Comment)
+     ((Comment)n).appendData(s);
+ }
+
+ /**
+ * Report the start of DTD declarations, if any.
+ *
+ * Any declarations are assumed to be in the internal subset
+ * unless otherwise indicated.
+ *
+ * @param name The document type name.
+ * @param publicId The declared public identifier for the
+ * external DTD subset, or null if none was declared.
+ * @param systemId The declared system identifier for the
+ * external DTD subset, or null if none was declared.
+ * @see #endDTD
+ * @see #startEntity
+ */
+ public void startDTD(String name, String publicId, String systemId)
+ throws org.xml.sax.SAXException
+ {
+
+ // Do nothing for now.
+ }
+
+ /**
+ * Report the end of DTD declarations.
+ *
+ * @see #startDTD
+ */
+ public void endDTD() throws org.xml.sax.SAXException
+ {
+
+ // Do nothing for now.
+ }
+
+ /**
+ * Begin the scope of a prefix-URI Namespace mapping.
+ *
+ * <p>The information from this event is not necessary for
+ * normal Namespace processing: the SAX XML reader will
+ * automatically replace prefixes for element and attribute
+ * names when the http://xml.org/sax/features/namespaces
+ * feature is true (the default).</p>
+ *
+ * <p>There are cases, however, when applications need to
+ * use prefixes in character data or in attribute values,
+ * where they cannot safely be expanded automatically; the
+ * start/endPrefixMapping event supplies the information
+ * to the application to expand prefixes in those contexts
+ * itself, if necessary.</p>
+ *
+ * <p>Note that start/endPrefixMapping events are not
+ * guaranteed to be properly nested relative to each-other:
+ * all startPrefixMapping events will occur before the
+ * corresponding startElement event, and all endPrefixMapping
+ * events will occur after the corresponding endElement event,
+ * but their order is not guaranteed.</p>
+ *
+ * @param prefix The Namespace prefix being declared.
+ * @param uri The Namespace URI the prefix is mapped to.
+ * @see #endPrefixMapping
+ * @see #startElement
+ */
+ public void startPrefixMapping(String prefix, String uri)
+ throws org.xml.sax.SAXException
+ {
+
+ /*
+ // Not sure if this is needed or wanted
+ // Also, it fails in the stree.
+ if((null != m_currentNode)
+ && (m_currentNode.getNodeType() == Node.ELEMENT_NODE))
+ {
+ String qname;
+ if(((null != prefix) && (prefix.length() == 0))
+ || (null == prefix))
+ qname = "xmlns";
+ else
+ qname = "xmlns:"+prefix;
+
+ Element elem = (Element)m_currentNode;
+ String val = elem.getAttribute(qname); // Obsolete, should be DOM2...?
+ if(val == null)
+ {
+ elem.setAttributeNS("http://www.w3.org/XML/1998/namespace",
+ qname, uri);
+ }
+ }
+ */
+ }
+
+ /**
+ * End the scope of a prefix-URI mapping.
+ *
+ * <p>See startPrefixMapping for details. This event will
+ * always occur after the corresponding endElement event,
+ * but the order of endPrefixMapping events is not otherwise
+ * guaranteed.</p>
+ *
+ * @param prefix The prefix that was being mapped.
+ * @see #startPrefixMapping
+ * @see #endElement
+ */
+ public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException{}
+
+ /**
+ * Receive notification of a skipped entity.
+ *
+ * <p>The Parser will invoke this method once for each entity
+ * skipped. Non-validating processors may skip entities if they
+ * have not seen the declarations (because, for example, the
+ * entity was declared in an external DTD subset). All processors
+ * may skip external entities, depending on the values of the
+ * http://xml.org/sax/features/external-general-entities and the
+ * http://xml.org/sax/features/external-parameter-entities
+ * properties.</p>
+ *
+ * @param name The name of the skipped entity. If it is a
+ * parameter entity, the name will begin with '%'.
+ */
+ public void skippedEntity(String name) throws org.xml.sax.SAXException{}
+}
Propchange: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (added)
+++ incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java Wed Jun 1 15:20:01 2005
@@ -0,0 +1,186 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.net.URL;
+import java.util.Properties;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.w3c.dom.*;
+
+/**
+ * Class for parsing META Directives from DOM trees. This class
+ * handles specifically Robots META directives (all, none, nofollow,
+ * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
+ * instructions. All meta directives are stored in a HTMLMetaTags instance.
+ */
+public class HTMLMetaProcessor {
+
+ /**
+ * Utility class with indicators for the robots directives "noindex"
+ * and "nofollow", and HTTP-EQUIV/no-cache
+ */
+
+ /**
+ * Sets the indicators in <code>metaTags</code> to appropriate
+ * values, based on any META tags found under the given
+ * <code>node</code>.
+ */
+ public static final void getMetaTags (
+ HTMLMetaTags metaTags, Node node, URL currURL) {
+ // reset() runs before the tree walk; presumably it discards values left from an earlier use - confirm in HTMLMetaTags.
+ metaTags.reset();
+ getMetaTagsHelper(metaTags, node, currURL);
+ }
+
+ private static final void getMetaTagsHelper(
+ HTMLMetaTags metaTags, Node node, URL currURL) {
+
+ if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+ if ("body".equalsIgnoreCase(node.getNodeName())) {
+ // META tags should not be under body
+ return;
+ }
+
+ if ("meta".equalsIgnoreCase(node.getNodeName())) {
+ NamedNodeMap attrs = node.getAttributes();
+ Node nameNode = attrs.getNamedItem("name");
+ Node equivNode = attrs.getNamedItem("http-equiv");
+ Node contentNode = attrs.getNamedItem("content");
+
+ if (nameNode != null) {
+ if (contentNode != null) {
+ String name = nameNode.getNodeValue().toLowerCase();
+ metaTags.getGeneralTags().setProperty(name, contentNode.getNodeValue());
+ if ("robots".equals(name)) {
+
+ if (contentNode != null) {
+ String directives =
+ contentNode.getNodeValue().toLowerCase();
+ int index = directives.indexOf("none");
+
+ if (index >= 0) {
+ metaTags.setNoIndex();
+ metaTags.setNoFollow();
+ }
+
+ index = directives.indexOf("all");
+ if (index >= 0) {
+ // do nothing...
+ }
+
+ index = directives.indexOf("noindex");
+ if (index >= 0) {
+ metaTags.setNoIndex();
+ }
+
+ index = directives.indexOf("nofollow");
+ if (index >= 0) {
+ metaTags.setNoFollow();
+ }
+ }
+
+ } // end if (name == robots)
+ }
+ }
+
+ if (equivNode != null) {
+ if (contentNode != null) {
+ String name = equivNode.getNodeValue().toLowerCase();
+ String content = contentNode.getNodeValue();
+ metaTags.getHttpEquivTags().setProperty(name, content);
+ if ("pragma".equals(name)) {
+ content = content.toLowerCase();
+ int index = content.indexOf("no-cache");
+ if (index >= 0)
+ metaTags.setNoCache();
+ } else if ("refresh".equals(name)) {
+ int idx = content.indexOf(';');
+ String time = null;
+ if (idx == -1) { // just the refresh time
+ time = content;
+ } else time = content.substring(0, idx);
+ try {
+ metaTags.setRefreshTime(Integer.parseInt(time));
+ // skip this if we couldn't parse the time
+ metaTags.setRefresh(true);
+ } catch (Exception e) {
+ ;
+ }
+ if (metaTags.getRefresh() && idx != -1) { // set the URL
+ idx = content.toLowerCase().indexOf("url=");
+ if (idx == -1) { // assume a mis-formatted entry with just the url
+ idx = content.indexOf(';') + 1;
+ } else idx += 4;
+ if (idx != -1) {
+ String url = content.substring(idx);
+ URL refreshUrl = null;
+ try {
+ refreshUrl = new URL(url);
+ } catch (Exception e) {
+ // this has to be an absolute url!
+ if (!url.startsWith("/")) url = "/" + url;
+ try {
+ refreshUrl = new URL(currURL, url);
+ } catch (Exception e1) {
+ ;
+ }
+ }
+ if (refreshUrl == null) metaTags.setRefresh(false);
+ metaTags.setRefreshHref(refreshUrl);
+ }
+ }
+ }
+ }
+ }
+
+ } else if ("base".equalsIgnoreCase(node.getNodeName())) {
+ NamedNodeMap attrs = node.getAttributes();
+ Node hrefNode = attrs.getNamedItem("href");
+
+ if (hrefNode != null) {
+ String urlString = hrefNode.getNodeValue();
+
+ URL url = null;
+ try {
+ if (currURL == null)
+ url = new URL(urlString);
+ else
+ url = new URL(currURL, urlString);
+ } catch (Exception e) {
+ ;
+ }
+
+ if (url != null)
+ metaTags.setBaseHref(url);
+ }
+
+ }
+
+ }
+
+ NodeList children = node.getChildNodes();
+ if (children != null) {
+ int len = children.getLength();
+ for (int i = 0; i < len; i++) {
+ getMetaTagsHelper(metaTags, children.item(i), currURL);
+ }
+ }
+ }
+
+}
Propchange: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed Jun 1 15:20:01 2005
@@ -28,14 +28,11 @@
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.w3c.dom.*;
-import org.w3c.dom.html.*;
import org.apache.html.dom.*;
-import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.*;
import org.apache.nutch.parse.*;
-import org.apache.nutch.parse.html.RobotsMetaProcessor.*;
public class HtmlParser implements Parser {
@@ -52,6 +49,8 @@
private static Pattern charsetPattern =
Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
Pattern.CASE_INSENSITIVE);
+
+ private static String parserImpl = NutchConf.get().get("parser.html.impl", "neko");
/**
* Given a <code>byte[]</code> representing an html file of an
@@ -94,22 +93,14 @@
private static String defaultCharEncoding =
NutchConf.get().get("parser.character.encoding.default", "windows-1252");
- public Parse getParse(Content content) throws ParseException {
- DOMParser parser = new DOMParser();
-
- // some plugins, e.g., creativecommons, need to examine html comments
- try {
- parser.setFeature("http://apache.org/xml/features/include-comments",
- true);
- } catch (SAXException e) {}
-
- RobotsMetaIndicator robotsMeta = new RobotsMetaIndicator();
+ public Parse getParse(Content content) {
+ HTMLMetaTags metaTags = new HTMLMetaTags();
URL base;
try {
base = new URL(content.getBaseUrl());
} catch (MalformedURLException e) {
- throw new ParseException(e);
+ return new ParseStatus(e).getEmptyParse();
}
String text = "";
@@ -120,19 +111,18 @@
// check that contentType is one we can handle
String contentType = content.getContentType();
if (!"".equals(contentType) && !contentType.startsWith("text/html"))
- throw new ParseException("Content-Type not text/html: " + contentType);
+ return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+ "Content-Type not text/html: " + contentType).getEmptyParse();
// parse the content
DocumentFragment root;
try {
byte[] contentInOctets = content.getContent();
- InputSource input =
- new InputSource(new ByteArrayInputStream(contentInOctets));
+ InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
String encoding = StringUtil.parseCharacterEncoding(contentType);
if (encoding!=null) {
metadata.put("OriginalCharEncoding", encoding);
if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
- input.setEncoding(encoding);
metadata.put("CharEncodingForConversion", encoding);
LOG.fine(base + ": setting encoding to " + encoding);
}
@@ -144,7 +134,6 @@
if (encoding!=null) {
metadata.put("OriginalCharEncoding", encoding);
if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
- input.setEncoding(encoding);
metadata.put("CharEncodingForConversion", encoding);
LOG.fine(base + ": setting encoding to " + encoding);
}
@@ -158,33 +147,29 @@
// (e.g. se: windows-1252, kr: x-windows-949, cn: gb18030, tw: big5
// doesn't work for jp because euc-jp and shift_jis have about the
// same share)
-
+ encoding = defaultCharEncoding;
metadata.put("CharEncodingForConversion", defaultCharEncoding);
- input.setEncoding(defaultCharEncoding);
LOG.fine(base + ": falling back to " + defaultCharEncoding);
}
-
+ input.setEncoding(encoding);
LOG.fine("Parsing...");
- parser.parse(input);
-
- // convert Document to DocumentFragment
- HTMLDocumentImpl doc = (HTMLDocumentImpl)parser.getDocument();
- doc.setErrorChecking(false);
- root = doc.createDocumentFragment();
- root.appendChild(doc.getDocumentElement());
+ root = parse(input);
} catch (IOException e) {
- throw new ParseException(e);
+ return new ParseStatus(e).getEmptyParse();
} catch (DOMException e) {
- throw new ParseException(e);
+ return new ParseStatus(e).getEmptyParse();
} catch (SAXException e) {
- throw new ParseException(e);
+ return new ParseStatus(e).getEmptyParse();
+ } catch (Exception e) {
+ e.printStackTrace();
+ return new ParseStatus(e).getEmptyParse();
}
// get meta directives
- RobotsMetaProcessor.getRobotsMetaDirectives(robotsMeta, root, base);
-
+ HTMLMetaProcessor.getMetaTags(metaTags, root, base);
+ LOG.info("Meta tags for " + base + ": " + metaTags.toString());
// check meta directives
- if (!robotsMeta.getNoIndex()) { // okay to index
+ if (!metaTags.getNoIndex()) { // okay to index
StringBuffer sb = new StringBuffer();
LOG.fine("Getting text...");
DOMContentUtils.getText(sb, root); // extract text
@@ -195,7 +180,7 @@
title = sb.toString().trim();
}
- if (!robotsMeta.getNoFollow()) { // okay to follow links
+ if (!metaTags.getNoFollow()) { // okay to follow links
ArrayList l = new ArrayList(); // extract outlinks
URL baseTag = DOMContentUtils.getBase(root);
LOG.fine("Getting links...");
@@ -204,20 +189,78 @@
LOG.fine("found "+outlinks.length+" outlinks in "+content.getUrl());
}
- if (!robotsMeta.getNoCache()) { // okay to cache
+ if (!metaTags.getNoCache()) { // okay to cache
// ??? FIXME ???
}
// copy content metadata through
metadata.putAll(content.getMetadata());
-
- ParseData parseData = new ParseData(title, outlinks, metadata);
+ ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
+ if (metaTags.getRefresh()) {
+ status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
+ status.setMessage(metaTags.getRefreshHref().toString());
+ }
+ ParseData parseData = new ParseData(status, title, outlinks, metadata);
Parse parse = new ParseImpl(text, parseData);
// run filters on parse
- return HtmlParseFilters.filter(content, parse, root);
+ return HtmlParseFilters.filter(content, parse, metaTags, root);
}
+ /**
+  * Parses the input with the HTML parser implementation named by
+  * <code>parserImpl</code>: TagSoup when it equals "tagsoup" (ignoring case),
+  * NekoHTML in every other case.
+  */
+ private DocumentFragment parse(InputSource input) throws Exception {
+   boolean useTagSoup = parserImpl.equalsIgnoreCase("tagsoup");
+   if (!useTagSoup) {
+     return parseNeko(input);
+   }
+   return parseTagSoup(input);
+ }
+
+ /**
+  * Parses the input with TagSoup, feeding its SAX events into a DOMBuilder
+  * that fills a fresh document fragment.
+  *
+  * @param input source of the HTML to parse
+  * @return the fragment populated by the builder
+  */
+ private DocumentFragment parseTagSoup(InputSource input) throws Exception {
+   HTMLDocumentImpl document = new HTMLDocumentImpl();
+   DocumentFragment fragment = document.createDocumentFragment();
+   DOMBuilder domBuilder = new DOMBuilder(document, fragment);
+
+   org.ccil.cowan.tagsoup.Parser tagSoup = new org.ccil.cowan.tagsoup.Parser();
+   // the same builder receives both content events and lexical events
+   tagSoup.setContentHandler(domBuilder);
+   tagSoup.setFeature(tagSoup.ignoreBogonsFeature, true);
+   tagSoup.setFeature(tagSoup.bogonsEmptyFeature, false);
+   tagSoup.setProperty("http://xml.org/sax/properties/lexical-handler", domBuilder);
+
+   tagSoup.parse(input);
+   return fragment;
+ }
+
+ /**
+  * Parses the input with the NekoHTML fragment parser and collects every
+  * fragment it produces into a single result fragment.
+  *
+  * @param input source of the HTML to parse
+  * @return a fragment holding the nodes of all parsed fragments
+  * @throws Exception if the first parse pass fails; failures in later passes
+  *         are logged and the nodes collected so far are returned
+  */
+ private DocumentFragment parseNeko(InputSource input) throws Exception {
+   DOMFragmentParser parser = new DOMFragmentParser();
+   try {
+     // some plugins, e.g., creativecommons, need to examine html comments
+     parser.setFeature("http://apache.org/xml/features/include-comments",
+                       true);
+     parser.setFeature("http://apache.org/xml/features/augmentations",
+                       true);
+     parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
+                       false);
+     parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
+                       true);
+     parser.setFeature("http://cyberneko.org/html/features/report-errors",
+                       true);
+   } catch (SAXException e) {
+     // best effort: a rejected feature must not prevent parsing, but leave a
+     // trace (note that the features after the rejected one are not set)
+     LOG.fine("Could not set HTML parser feature: " + e);
+   }
+
+   // all fragments are created by the same document so they can be merged
+   HTMLDocumentImpl doc = new HTMLDocumentImpl();
+   doc.setErrorChecking(false);
+   DocumentFragment res = doc.createDocumentFragment();
+   DocumentFragment frag = doc.createDocumentFragment();
+   parser.parse(input, frag);
+   res.appendChild(frag);
+
+   // NOTE(review): presumably a single pass may stop before the end of the
+   // input, hence the repeated parsing until a pass yields no nodes - confirm.
+   try {
+     while (true) {
+       frag = doc.createDocumentFragment();
+       parser.parse(input, frag);
+       if (!frag.hasChildNodes()) break;
+       LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes.");
+       res.appendChild(frag);
+     }
+   } catch (Exception x) {
+     // keep what has been parsed so far; report through the logger instead
+     // of printing the stack trace to stderr
+     LOG.log(Level.WARNING, "Error while parsing additional HTML fragments", x);
+   }
+   return res;
+ }
+
public static void main(String[] args) throws Exception {
LOG.setLevel(Level.FINE);
String name = args[0];
Added: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java (added)
+++ incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java Wed Jun 1 15:20:01 2005
@@ -0,0 +1,113 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer,
+ * XXX in order to avoid dependency on Xalan.
+ */
+
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id: XMLCharacterRecognizer.java,v 1.7 2004/02/17 04:21:14 minchau Exp $
+ */
+package org.apache.nutch.parse.html;
+
+/**
+ * Class used to verify whether the specified <var>ch</var>
+ * conforms to the XML 1.0 definition of whitespace.
+ * @xsl.usage internal
+ */
+ public class XMLCharacterRecognizer
+ {
+
+ /**
+ * Returns whether the specified <var>ch</var> conforms to the XML 1.0 definition
+ * of whitespace. Refer to <A href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S">
+ * the definition of <CODE>S</CODE></A> for details.
+ * The characters accepted are space (0x20), tab (0x09), carriage return (0xD)
+ * and line feed (0xA).
+ * @param ch Character to check as XML whitespace.
+ * @return =true if <var>ch</var> is XML whitespace; otherwise =false.
+ */
+ public static boolean isWhiteSpace(char ch)
+ {
+ return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA);
+ }
+
+ /**
+ * Tell if a range of a character array consists only of whitespace.
+ *
+ * @param ch Character array to check as XML whitespace.
+ * @param start Index of the first character to check
+ * @param length Number of characters to check, beginning at <var>start</var>
+ * @return True if all characters in the range are XML whitespace
+ * (also when the range is empty); otherwise, false.
+ */
+ public static boolean isWhiteSpace(char ch[], int start, int length)
+ {
+
+ int end = start + length;
+
+ for (int s = start; s < end; s++)
+ {
+ if (!isWhiteSpace(ch[s]))
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * Tell if the string buffer consists only of whitespace.
+ *
+ * @param buf StringBuffer to check as XML whitespace.
+ * @return True if all characters in the buffer are XML whitespace
+ * (also when the buffer is empty), false otherwise
+ */
+ public static boolean isWhiteSpace(StringBuffer buf)
+ {
+
+ int n = buf.length();
+
+ for (int i = 0; i < n; i++)
+ {
+ if (!isWhiteSpace(buf.charAt(i)))
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * Tell if the string consists only of whitespace.
+ *
+ * @param s String to check as XML whitespace; may be null.
+ * @return True if all characters in the string are XML whitespace, and also
+ * when the string is null or empty; false otherwise
+ */
+ public static boolean isWhiteSpace(String s)
+ {
+
+ if(null != s)
+ {
+ int n = s.length();
+
+ for (int i = 0; i < n; i++)
+ {
+ if (!isWhiteSpace(s.charAt(i)))
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+}
Propchange: incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Wed Jun 1 15:20:01 2005
@@ -205,6 +205,7 @@
new Outlink("http://www.nutch.org/frames/right.html", ""),
},
{
+ new Outlink("http://www.nutch.org/maps/logo.gif", ""),
new Outlink("http://www.nutch.org/index.html", ""),
new Outlink("http://www.nutch.org/maps/#bottom", ""),
new Outlink("http://www.nutch.org/bot.html", ""),
Modified: incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java Wed Jun 1 15:20:01 2005
@@ -18,7 +18,8 @@
import junit.framework.TestCase;
-import org.apache.nutch.parse.html.RobotsMetaProcessor.*;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.html.HTMLMetaProcessor.*;
import java.io.ByteArrayInputStream;
import java.net.URL;
@@ -28,7 +29,7 @@
import org.w3c.dom.*;
import org.apache.html.dom.*;
-/** Unit tests for RobotsMetaProcessor. */
+/** Unit tests for HTMLMetaProcessor. */
public class TestRobotsMetaProcessor extends TestCase {
public TestRobotsMetaProcessor(String name) {
super(name);
@@ -157,8 +158,8 @@
e.printStackTrace();
}
- RobotsMetaIndicator robotsMeta= new RobotsMetaIndicator();
- RobotsMetaProcessor.getRobotsMetaDirectives(robotsMeta, node,
+ HTMLMetaTags robotsMeta= new HTMLMetaTags();
+ HTMLMetaProcessor.getMetaTags(robotsMeta, node,
currURLsAndAnswers[i][0]);
assertTrue("got index wrong on test " + i,
Added: incubator/nutch/trunk/src/plugin/parse-js/build.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-js/build.xml?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-js/build.xml (added)
+++ incubator/nutch/trunk/src/plugin/parse-js/build.xml Wed Jun 1 15:20:01 2005
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="parse-js" default="jar">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
Propchange: incubator/nutch/trunk/src/plugin/parse-js/build.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/nutch/trunk/src/plugin/parse-js/plugin.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-js/plugin.xml?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-js/plugin.xml (added)
+++ incubator/nutch/trunk/src/plugin/parse-js/plugin.xml Wed Jun 1 15:20:01 2005
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="parse-js"
+ name="JavaScript Parser"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <extension-point
+ id="org.apache.nutch.parse.HtmlParseFilter"
+ name="HTML Parse Filter"/>
+
+ <runtime>
+ <library name="parse-js.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <extension id="org.apache.nutch.parse.js.JSParseFilter"
+ name="Parse JS Filter"
+ point="org.apache.nutch.parse.HtmlParseFilter">
+ <implementation id="JSParseFilter"
+ class="org.apache.nutch.parse.js.JSParseFilter"
+ contentType="application/x-javascript"
+ pathSuffix=""/>
+ </extension>
+
+</plugin>
Propchange: incubator/nutch/trunk/src/plugin/parse-js/plugin.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html (added)
+++ incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html Wed Jun 1 15:20:01 2005
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>A parser plugin and content filter to extract all (possible) links
+from JavaScript files and code snippets.</p>
+</body>
+</html>
Propchange: incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (added)
+++ incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Wed Jun 1 15:20:01 2005
@@ -0,0 +1,226 @@
+package org.apache.nutch.parse.js;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Properties;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogFormatter;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * This class is a heuristic link extractor for JavaScript files and
+ * code snippets. The general idea of a two-pass regex matching comes from
+ * Heritrix. Parts of the code come from OutlinkExtractor.java
+ * by Stephan Strittmatter.
+ *
+ * @author Andrzej Bialecki <ab@getopt.org>
+ */
+public class JSParseFilter implements HtmlParseFilter, Parser {
+ public static final Logger LOG =
+ LogFormatter.getLogger("org.apache.nutch.parse.js.JSParseFilter");
+
+ private static final int MAX_TITLE_LEN = 80;
+
+ public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
+ String url = content.getBaseUrl();
+ ArrayList outlinks = new ArrayList();
+ walk(doc, parse, metaTags, url, outlinks);
+ if (outlinks.size() > 0) {
+ Outlink[] old = parse.getData().getOutlinks();
+ Properties metadata = parse.getData().getMetadata();
+ String title = parse.getData().getTitle();
+ List list = Arrays.asList(old);
+ outlinks.addAll(list);
+ ParseStatus status = parse.getData().getStatus();
+ String text = parse.getText();
+ Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
+ parse = new ParseImpl(text, new ParseData(status, title, newlinks, metadata));
+ }
+ return parse;
+ }
+
+ private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List outlinks) {
+ if (n instanceof Element) {
+ String name = n.getNodeName();
+ if (name.equalsIgnoreCase("script")) {
+ String lang = null;
+ Node lNode = n.getAttributes().getNamedItem("language");
+ if (lNode == null) lang = "javascript";
+ else lang = lNode.getNodeValue();
+ StringBuffer script = new StringBuffer();
+ NodeList nn = n.getChildNodes();
+ if (nn.getLength() > 0) {
+ for (int i = 0; i < nn.getLength(); i++) {
+ if (i > 0) script.append('\n');
+ script.append(nn.item(i).getNodeValue());
+ }
+ //LOG.info("script: language=" + lang + ", text: " + script.toString());
+ Outlink[] links = getJSLinks(script.toString(), base, base);
+ if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
+ // no other children of interest here, go one level up.
+ return;
+ }
+ } else {
+ // process all HTML 4.0 events, if present...
+ NamedNodeMap attrs = n.getAttributes();
+ int len = attrs.getLength();
+ for (int i = 0; i < len; i++) {
+ // Window: onload,onunload
+ // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
+ // Keyboard: onkeydown,onkeypress,onkeyup
+ // Mouse: onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
+ Node anode = attrs.item(i);
+ if (anode.getNodeName().startsWith("on")) {
+ Outlink[] links = getJSLinks(anode.getNodeValue(), base, base);
+ if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
+ }
+ }
+ }
+ }
+ NodeList nl = n.getChildNodes();
+ for (int i = 0; i < nl.getLength(); i++) {
+ walk(nl.item(i), parse, metaTags, base, outlinks);
+ }
+ }
+
+ public Parse getParse(Content c) {
+ String type = c.getContentType();
+ if (type != null && !type.toLowerCase().startsWith("application/x-javascript"))
+ return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
+ "Content not JavaScript: " + type).getEmptyParse();
+ String script = new String(c.getContent());
+ Outlink[] outlinks = getJSLinks(script, c.getUrl(), c.getUrl());
+ if (outlinks == null) outlinks = new Outlink[0];
+ // Title? use the first line of the script...
+ String title;
+ int idx = script.indexOf('\n');
+ if (idx != -1) {
+ if (idx > MAX_TITLE_LEN) idx = MAX_TITLE_LEN;
+ title = script.substring(0, idx);
+ } else {
+ idx = Math.min(MAX_TITLE_LEN, script.length());
+ title = script.substring(0, idx);
+ }
+ Properties metadata = new Properties();
+ metadata.putAll(c.getMetadata());
+ ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+ outlinks, metadata);
+ Parse parse = new ParseImpl(script, pd);
+ return parse;
+ }
+
+ private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
+ // A simple pattern. This allows also invalid URL characters.
+ private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";
+ // Alternative pattern, which limits valid url characters.
+ //private static final String URI_PATTERN = "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
+
+ /**
+ * This method extracts URLs from literals embedded in JavaScript.
+ */
+ private static Outlink[] getJSLinks(String plainText, String anchor, String base) {
+
+ final List outlinks = new ArrayList();
+ URL baseURL = null;
+
+ try {
+ baseURL = new URL(base);
+ } catch (Exception e) {
+ LOG.throwing(JSParseFilter.class.getName(), "getJSLinks", e);
+ }
+
+ try {
+ final PatternCompiler cp = new Perl5Compiler();
+ final Pattern pattern = cp.compile(STRING_PATTERN,
+ Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+ | Perl5Compiler.MULTILINE_MASK);
+ final Pattern pattern1 = cp.compile(URI_PATTERN,
+ Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+ | Perl5Compiler.MULTILINE_MASK);
+ final PatternMatcher matcher = new Perl5Matcher();
+
+ final PatternMatcher matcher1 = new Perl5Matcher();
+ final PatternMatcherInput input = new PatternMatcherInput(plainText);
+
+ MatchResult result;
+ String url;
+
+ //loop the matches
+ while (matcher.contains(input, pattern)) {
+ result = matcher.getMatch();
+ url = result.group(2);
+ PatternMatcherInput input1 = new PatternMatcherInput(url);
+ if (!matcher1.matches(input1, pattern1)) {
+ //LOG.fine(" - invalid '" + url + "'");
+ continue;
+ }
+ if (url.startsWith("www.")) {
+ url = "http://" + url;
+ } else url = new URL(baseURL, url).toString();
+ url = url.replaceAll("&", "&");
+ LOG.fine(" - outlink from JS: '" + url + "'");
+ outlinks.add(new Outlink(url, anchor));
+ }
+ } catch (Exception ex) {
+ // if it is a malformed URL we just throw it away and continue with
+ // extraction.
+ LOG.throwing(JSParseFilter.class.getName(), "getJSLinks", ex);
+ }
+
+ final Outlink[] retval;
+
+ //create array of the Outlinks
+ if (outlinks != null && outlinks.size() > 0) {
+ retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
+ } else {
+ retval = new Outlink[0];
+ }
+
+ return retval;
+ }
+
+ public static void main(String[] args) throws Exception {
+ if (args.length < 2) {
+ System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
+ return;
+ }
+ InputStream in = new FileInputStream(args[0]);
+ BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
+ StringBuffer sb = new StringBuffer();
+ String line = null;
+ while ((line = br.readLine()) != null) sb.append(line + "\n");
+ Outlink[] links = getJSLinks(sb.toString(), args[1], args[1]);
+ System.out.println("Outlinks extracted: " + links.length);
+ for (int i = 0; i < links.length; i++)
+ System.out.println(" - " + links[i]);
+ }
+}
Propchange: incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: incubator/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Wed Jun 1 15:20:01 2005
@@ -18,6 +18,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
@@ -52,13 +53,13 @@
public MSWordParser () {}
- public Parse getParse(Content content) throws ParseException {
+ public Parse getParse(Content content) {
// check that contentType is one we can handle
String contentType = content.getContentType();
if (contentType != null && !contentType.startsWith("application/msword"))
- throw new ParseException(
- "Content-Type not application/msword: "+contentType);
+ return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+ "Content-Type not application/msword: " + contentType).getEmptyParse();
String text = null;
String title = null;
@@ -71,8 +72,9 @@
String contentLength = content.get("Content-Length");
if (contentLength != null
&& raw.length != Integer.parseInt(contentLength)) {
- throw new ParseException("Content truncated at "+raw.length
- +" bytes. Parser can't handle incomplete msword file.");
+ return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+ "Content truncated at " + raw.length
+ +" bytes. Parser can't handle incomplete msword file.").getEmptyParse();
}
WordExtractor extractor = new WordExtractor();
@@ -86,13 +88,14 @@
extractor = null;
} catch (ParseException e) {
- throw e;
+ return new ParseStatus(e).getEmptyParse();
} catch (FastSavedException e) {
- throw new ParseException(e);
+ return new ParseStatus(e).getEmptyParse();
} catch (PasswordProtectedException e) {
- throw new ParseException(e);
+ return new ParseStatus(e).getEmptyParse();
} catch (Exception e) { // run time exception
- throw new ParseException("Can't be handled as msword document. "+e);
+ return new ParseStatus(ParseStatus.FAILED,
+ "Can't be handled as msword document. " + e).getEmptyParse();
} finally {
// nothing so far
}
@@ -116,7 +119,7 @@
// collect outlink
Outlink[] outlinks = new Outlink[0];
- ParseData parseData = new ParseData(title, outlinks, metadata);
+ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
return new ParseImpl(text, parseData);
// any filter?
//return HtmlParseFilters.filter(content, parse, root);
Modified: incubator/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Wed Jun 1 15:20:01 2005
@@ -64,7 +64,7 @@
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
protocol = ProtocolFactory.getProtocol(urlString);
- content = protocol.getContent(urlString);
+ content = protocol.getProtocolOutput(urlString).getContent();
parser = ParserFactory.getParser(content.getContentType(), urlString);
parse = parser.getParse(content);
Modified: incubator/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Wed Jun 1 15:20:01 2005
@@ -27,6 +27,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
@@ -79,13 +80,13 @@
rootLogger.addAppender(appender);
}
- public Parse getParse(Content content) throws ParseException {
+ public Parse getParse(Content content) {
// check that contentType is one we can handle
String contentType = content.getContentType();
if (contentType != null && !contentType.startsWith("application/pdf"))
- throw new ParseException(
- "Content-Type not application/pdf: "+contentType);
+ return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+ "Content-Type not application/pdf: " + contentType).getEmptyParse();
// in memory representation of pdf file
PDDocument pdf = null;
@@ -100,8 +101,9 @@
String contentLength = content.get("Content-Length");
if (contentLength != null
&& raw.length != Integer.parseInt(contentLength)) {
- throw new ParseException("Content truncated at "+raw.length
- +" bytes. Parser can't handle incomplete pdf file.");
+ return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+ "Content truncated at "+raw.length
+ +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
}
PDFParser parser = new PDFParser(
@@ -134,14 +136,15 @@
// formatDate(info.getCreationDate())
// formatDate(info.getModificationDate())
- } catch (ParseException e) {
- throw e;
} catch (CryptographyException e) {
- throw new ParseException("Error decrypting document. "+e);
+ return new ParseStatus(ParseStatus.FAILED,
+ "Error decrypting document. " + e).getEmptyParse();
} catch (InvalidPasswordException e) {
- throw new ParseException("Can't decrypt document. "+e);
+ return new ParseStatus(ParseStatus.FAILED,
+ "Can't decrypt document - invalid password. " + e).getEmptyParse();
} catch (Exception e) { // run time exception
- throw new ParseException("Can't be handled as pdf document. "+e);
+ return new ParseStatus(ParseStatus.FAILED,
+ "Can't be handled as pdf document. " + e).getEmptyParse();
} finally {
try {
if (pdf != null)
@@ -164,7 +167,7 @@
Properties metadata = new Properties();
metadata.putAll(content.getMetadata()); // copy through
- ParseData parseData = new ParseData(title, outlinks, metadata);
+ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
return new ParseImpl(text, parseData);
// any filter?
//return HtmlParseFilters.filter(content, parse, root);
Modified: incubator/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java Wed Jun 1 15:20:01 2005
@@ -64,7 +64,7 @@
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
protocol = ProtocolFactory.getProtocol(urlString);
- content = protocol.getContent(urlString);
+ content = protocol.getProtocolOutput(urlString).getContent();
parser = ParserFactory.getParser(content.getContentType(), urlString);
parse = parser.getParse(content);
Modified: incubator/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Wed Jun 1 15:20:01 2005
@@ -23,12 +23,12 @@
import org.apache.nutch.util.*;
public class TextParser implements Parser {
- public Parse getParse(Content content) throws ParseException {
+ public Parse getParse(Content content) {
// copy content meta data through
Properties metadata = new Properties();
metadata.putAll(content.getMetadata());
- ParseData parseData = new ParseData("", new Outlink[0], metadata);
+ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata);
String encoding =
StringUtil.parseCharacterEncoding(content.getContentType());
@@ -37,7 +37,7 @@
try { // try to use named encoding
text = new String(content.getContent(), encoding);
} catch (java.io.UnsupportedEncodingException e) {
- throw new ParseException(e);
+ return new ParseStatus(e).getEmptyParse();
}
} else {
// FIXME: implement charset detector. This code causes problem when
Modified: incubator/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original)
+++ incubator/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Wed Jun 1 15:20:01 2005
@@ -17,24 +17,24 @@
package org.apache.nutch.protocol.file;
+import org.apache.nutch.db.Page;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
import java.util.logging.Level;
import java.util.logging.Logger;
+import java.net.MalformedURLException;
import java.net.URL;
-import java.io.InputStream;
-// 20040528, xing, disabled for now
-//import java.io.Reader;
-import java.io.IOException;
-
/************************************
* File.java deals with file: scheme.
*
@@ -65,9 +65,20 @@
}
/** Set the point at which content is truncated. */
- public void setMaxContentLength(int length) {this.maxContentLength = length;}
+ public void setMaxContentLength(int length) {maxContentLength = length;}
- public Content getContent(String urlString) throws FileException {
+ public ProtocolOutput getProtocolOutput(String urlString) {
+ ProtocolOutput output = null;
+ try {
+ return getProtocolOutput(new FetchListEntry(true,
+ new Page(urlString, 1.0f), new String[0]));
+ } catch (MalformedURLException mue) {
+ return new ProtocolOutput(null, new ProtocolStatus(mue));
+ }
+ }
+
+ public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
+ String urlString = fle.getUrl().toString();
try {
URL url = new URL(urlString);
@@ -80,7 +91,7 @@
int code = response.getCode();
if (code == 200) { // got a good response
- return response.toContent(); // return it
+ return new ProtocolOutput(response.toContent()); // return it
} else if (code >= 300 && code < 400) { // handle redirect
if (redirects == MAX_REDIRECTS)
@@ -94,8 +105,8 @@
throw new FileError(code);
}
}
- } catch (IOException e) {
- throw new FileException(e);
+ } catch (Exception e) {
+ return new ProtocolOutput(null, new ProtocolStatus(e));
}
}
@@ -139,7 +150,7 @@
// set log level
LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
- Content content = file.getContent(urlString);
+ Content content = file.getProtocolOutput(urlString).getContent();
System.err.println("Content-Type: " + content.getContentType());
System.err.println("Content-Length: " + content.get("Content-Length"));
Modified: incubator/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original)
+++ incubator/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Wed Jun 1 15:20:01 2005
@@ -19,22 +19,24 @@
import org.apache.commons.net.ftp.FTPFileEntryParser;
+import org.apache.nutch.db.Page;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
import java.util.logging.Level;
import java.util.logging.Logger;
+import java.net.MalformedURLException;
import java.net.URL;
-import java.io.InputStream;
-// 20040528, xing, disabled for now
-//import java.io.Reader;
import java.io.IOException;
/************************************
@@ -91,13 +93,13 @@
}
/** Set the timeout. */
- public void setTimeout(int timeout) {
- this.timeout = timeout;
+ public void setTimeout(int to) {
+ timeout = to;
}
/** Set the point at which content is truncated. */
public void setMaxContentLength(int length) {
- this.maxContentLength = length;
+ maxContentLength = length;
}
/** Set followTalk */
@@ -110,7 +112,18 @@
this.keepConnection = keepConnection;
}
- public Content getContent(String urlString) throws FtpException {
+ public ProtocolOutput getProtocolOutput(String urlString) {
+ ProtocolOutput output = null;
+ try {
+ return getProtocolOutput(new FetchListEntry(true,
+ new Page(urlString, 1.0f), new String[0]));
+ } catch (MalformedURLException mue) {
+ return new ProtocolOutput(null, new ProtocolStatus(mue));
+ }
+ }
+
+ public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
+ String urlString = fle.getUrl().toString();
try {
URL url = new URL(urlString);
@@ -123,7 +136,7 @@
int code = response.getCode();
if (code == 200) { // got a good response
- return response.toContent(); // return it
+ return new ProtocolOutput(response.toContent()); // return it
} else if (code >= 300 && code < 400) { // handle redirect
if (redirects == MAX_REDIRECTS)
@@ -137,8 +150,8 @@
throw new FtpError(code);
}
}
- } catch (IOException e) {
- throw new FtpException(e);
+ } catch (Exception e) {
+ return new ProtocolOutput(null, new ProtocolStatus(e));
}
}
@@ -205,7 +218,7 @@
// set log level
LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
- Content content = ftp.getContent(urlString);
+ Content content = ftp.getProtocolOutput(urlString).getContent();
System.err.println("Content-Type: " + content.getContentType());
System.err.println("Content-Length: " + content.get("Content-Length"));
Modified: incubator/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java (original)
+++ incubator/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java Wed Jun 1 15:20:01 2005
@@ -16,7 +16,7 @@
package org.apache.nutch.protocol.http;
-import java.io.*;
+import java.net.MalformedURLException;
import java.net.URL;
import java.net.InetAddress;
import java.net.UnknownHostException;
@@ -28,6 +28,8 @@
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.db.Page;
+import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.protocol.*;
/** An implementation of the Http protocol. */
@@ -170,7 +172,18 @@
}
}
- public Content getContent(String urlString) throws ProtocolException {
+ public ProtocolOutput getProtocolOutput(String urlString) {
+ ProtocolOutput output = null;
+ try {
+ return getProtocolOutput(new FetchListEntry(true,
+ new Page(urlString, 1.0f), new String[0]));
+ } catch (MalformedURLException mue) {
+ return new ProtocolOutput(null, new ProtocolStatus(mue));
+ }
+ }
+
+ public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
+ String urlString = fle.getUrl().toString();
try {
URL url = new URL(urlString);
@@ -191,7 +204,7 @@
int code = response.getCode();
if (code == 200) { // got a good response
- return response.toContent(); // return it
+ return new ProtocolOutput(response.toContent()); // return it
} else if (code == 410) { // page is gone
throw new ResourceGone(url, "Http: " + code);
@@ -207,8 +220,8 @@
throw new HttpError(code);
}
}
- } catch (IOException e) {
- throw new HttpException(e);
+ } catch (Exception e) {
+ return new ProtocolOutput(null, new ProtocolStatus(e));
}
}
@@ -285,7 +298,7 @@
LOG.setLevel(Level.FINE);
}
- Content content = http.getContent(url);
+ Content content = http.getProtocolOutput(url).getContent();
System.out.println("Content Type: " + content.getContentType());
System.out.println("Content Length: " + content.get("Content-Length"));
Added: incubator/nutch/trunk/src/plugin/protocol-httpclient/build.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-httpclient/build.xml?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/protocol-httpclient/build.xml (added)
+++ incubator/nutch/trunk/src/plugin/protocol-httpclient/build.xml Wed Jun 1 15:20:01 2005
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="protocol-httpclient" default="jar">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
Propchange: incubator/nutch/trunk/src/plugin/protocol-httpclient/build.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-codec.jar
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-codec.jar?rev=179436&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-codec.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0-rc2.jar
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0-rc2.jar?rev=179436&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0-rc2.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml (added)
+++ incubator/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml Wed Jun 1 15:20:01 2005
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="protocol-httpclient"
+ name="Http / Https Protocol Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <extension-point
+ id="org.apache.nutch.protocol.Protocol"
+ name="Nutch Protocol"/>
+
+ <runtime>
+ <library name="protocol-httpclient.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-codec.jar" />
+ <library name="commons-httpclient-3.0-rc2.jar" />
+
+ </runtime>
+
+ <extension id="org.apache.nutch.protocol.httpclient"
+ name="HttpProtocol"
+ point="org.apache.nutch.protocol.Protocol">
+
+ <implementation id="org.apache.nutch.protocol.httpclient.Http"
+ class="org.apache.nutch.protocol.httpclient.Http"
+ protocolName="http"/>
+
+ </extension>
+
+ <extension id="org.apache.nutch.protocol.https"
+ name="HttpsProtocol"
+ point="org.apache.nutch.protocol.Protocol">
+
+ <implementation id="org.apache.nutch.protocol.httpclient.Http"
+ class="org.apache.nutch.protocol.httpclient.Http"
+ protocolName="https"/>
+
+ </extension>
+
+</plugin>
Propchange: incubator/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java (added)
+++ incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java Wed Jun 1 15:20:01 2005
@@ -0,0 +1,129 @@
+/*
+ * Based on EasySSLProtocolSocketFactory from commons-httpclient:
+ *
+ * $Header:
+ * /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/contrib/org/apache/commons/httpclient/contrib/ssl/DummySSLProtocolSocketFactory.java,v
+ * 1.7 2004/06/11 19:26:27 olegk Exp $ $Revision$ $Date: 2005-02-26 05:01:52
+ * -0800 (Sat, 26 Feb 2005) $
+ */
+
+package org.apache.nutch.protocol.httpclient;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.Socket;
+import java.net.UnknownHostException;
+
+import org.apache.commons.httpclient.ConnectTimeoutException;
+import org.apache.commons.httpclient.HttpClientError;
+import org.apache.commons.httpclient.params.HttpConnectionParams;
+import org.apache.commons.httpclient.protocol.ControllerThreadSocketFactory;
+import org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import com.sun.net.ssl.SSLContext;
+import com.sun.net.ssl.TrustManager;
+
+public class DummySSLProtocolSocketFactory implements SecureProtocolSocketFactory {
+
+ /** Log object for this class. */
+ private static final Log LOG = LogFactory.getLog(DummySSLProtocolSocketFactory.class);
+
+ private SSLContext sslcontext = null;
+
+ /**
+ * Constructor for DummySSLProtocolSocketFactory.
+ */
+ public DummySSLProtocolSocketFactory() {
+ super();
+ }
+
+ private static SSLContext createEasySSLContext() {
+ try {
+ SSLContext context = SSLContext.getInstance("SSL");
+ context.init(null, new TrustManager[] { new DummyX509TrustManager(null) }, null);
+ return context;
+ } catch (Exception e) {
+ LOG.error(e.getMessage(), e);
+ throw new HttpClientError(e.toString());
+ }
+ }
+
+ private SSLContext getSSLContext() {
+ if (this.sslcontext == null) {
+ this.sslcontext = createEasySSLContext();
+ }
+ return this.sslcontext;
+ }
+
+ /**
+ * @see SecureProtocolSocketFactory#createSocket(java.lang.String,int,java.net.InetAddress,int)
+ */
+ public Socket createSocket(String host, int port, InetAddress clientHost, int clientPort) throws IOException,
+ UnknownHostException {
+
+ return getSSLContext().getSocketFactory().createSocket(host, port, clientHost, clientPort);
+ }
+
+ /**
+ * Attempts to get a new socket connection to the given host within the given
+ * time limit.
+ * <p>
+ * To circumvent the limitations of older JREs that do not support a connect
+ * timeout, a controller thread is executed. The controller thread attempts to
+ * create a new socket within the given time limit. If the socket constructor
+ * does not return before the timeout expires, the controller terminates and
+ * throws a {@link ConnectTimeoutException}.
+ * </p>
+ *
+ * @param host the host name/IP
+ * @param port the port on the host
+ * @param localAddress the local host name/IP to bind the socket to
+ * @param localPort the port on the local machine
+ * @param params {@link HttpConnectionParams Http connection parameters}
+ *
+ * @return Socket a new socket
+ *
+ * @throws IOException if an I/O error occurs while creating the socket
+ * @throws UnknownHostException if the IP address of the host cannot be
+ * determined
+ */
+ public Socket createSocket(final String host, final int port, final InetAddress localAddress, final int localPort,
+ final HttpConnectionParams params) throws IOException, UnknownHostException, ConnectTimeoutException {
+ if (params == null) {
+ throw new IllegalArgumentException("Parameters may not be null");
+ }
+ int timeout = params.getConnectionTimeout();
+ if (timeout == 0) {
+ return createSocket(host, port, localAddress, localPort);
+ } else {
+ // To be eventually deprecated when migrated to Java 1.4 or above
+ return ControllerThreadSocketFactory.createSocket(this, host, port, localAddress, localPort, timeout);
+ }
+ }
+
+ /**
+ * @see SecureProtocolSocketFactory#createSocket(java.lang.String,int)
+ */
+ public Socket createSocket(String host, int port) throws IOException, UnknownHostException {
+ return getSSLContext().getSocketFactory().createSocket(host, port);
+ }
+
+ /**
+ * @see SecureProtocolSocketFactory#createSocket(java.net.Socket,java.lang.String,int,boolean)
+ */
+ public Socket createSocket(Socket socket, String host, int port, boolean autoClose) throws IOException,
+ UnknownHostException {
+ return getSSLContext().getSocketFactory().createSocket(socket, host, port, autoClose);
+ }
+
+ public boolean equals(Object obj) {
+ return ((obj != null) && obj.getClass().equals(DummySSLProtocolSocketFactory.class));
+ }
+
+ public int hashCode() {
+ return DummySSLProtocolSocketFactory.class.hashCode();
+ }
+
+}
Propchange: incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
------------------------------------------------------------------------------
svn:eol-style = native