You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:19 UTC
[35/69] [abbrv] [partial] nutch git commit: Rearranged the source
code as per Maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
new file mode 100644
index 0000000..7c0d71b
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import java.lang.ClassLoader;
+import java.lang.InstantiationException;
+import java.util.HashMap;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.extractors.*;
+
+class BoilerpipeExtractorRepository {
+
+ public static final Log LOG = LogFactory.getLog(BoilerpipeExtractorRepository.class);
+ public static final HashMap<String, BoilerpipeExtractor> extractorRepository = new HashMap<String, BoilerpipeExtractor>();
+
+ /**
+ * Returns an instance of the specified extractor
+ */
+ public static synchronized BoilerpipeExtractor getExtractor(String boilerpipeExtractorName) {
+ // Check if there's no instance of this extractor
+ if (!extractorRepository.containsKey(boilerpipeExtractorName)) {
+ // FQCN
+ boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." + boilerpipeExtractorName;
+
+ // Attempt to load the class
+ try {
+ ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
+ Class extractorClass = loader.loadClass(boilerpipeExtractorName);
+
+ // Add an instance to the repository
+ extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.newInstance());
+
+ } catch (ClassNotFoundException e) {
+ LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not found!");
+ } catch (InstantiationException e) {
+ LOG.error("Could not instantiate " + boilerpipeExtractorName);
+ } catch (Exception e) {
+ LOG.error(e);
+ }
+ }
+
+ return extractorRepository.get(boilerpipeExtractorName);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java
new file mode 100644
index 0000000..77a1044
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java
@@ -0,0 +1,794 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
+ * avoid dependency on Xalan.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id: DOMBuilder.java 823614 2009-10-09 17:02:32Z ab $
+ */
+package org.apache.nutch.parse.tika;
+
+import java.util.Stack;
+
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.Text;
+import org.w3c.dom.CDATASection;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.ext.LexicalHandler;
+
+/**
+ * This class takes SAX events (in addition to some extra events that SAX
+ * doesn't handle yet) and adds the result to a document or document fragment.
+ */
+class DOMBuilder implements ContentHandler, LexicalHandler {
+ private boolean upperCaseElementNames = true;
+
+ /** Root document */
+ public Document m_doc;
+
+ /** Current node */
+ protected Node m_currentNode = null;
+
+ /** First node of document fragment or null if not a DocumentFragment */
+ public DocumentFragment m_docFrag = null;
+
+ /** Vector of element nodes */
+ protected Stack<Element> m_elemStack = new Stack<Element>();
+
+ /**
+ * Element recorded with this namespace will be converted to Node without a
+ * namespace
+ */
+ private String defaultNamespaceURI = null;
+
+ /**
+ * DOMBuilder instance constructor... it will add the DOM nodes to the
+ * document fragment.
+ *
+ * @param doc
+ * Root document
+ * @param node
+ * Current node
+ */
+ DOMBuilder(Document doc, Node node) {
+ m_doc = doc;
+ m_currentNode = node;
+ }
+
+ /**
+ * DOMBuilder instance constructor... it will add the DOM nodes to the
+ * document fragment.
+ *
+ * @param doc
+ * Root document
+ * @param docFrag
+ * Document fragment
+ */
+ DOMBuilder(Document doc, DocumentFragment docFrag) {
+ m_doc = doc;
+ m_docFrag = docFrag;
+ }
+
+ /**
+ * DOMBuilder instance constructor... it will add the DOM nodes to the
+ * document.
+ *
+ * @param doc
+ * Root document
+ */
+ DOMBuilder(Document doc) {
+ m_doc = doc;
+ }
+
+ /**
+ * Get the root node of the DOM being created. This is either a Document or a
+ * DocumentFragment.
+ *
+ * @return The root document or document fragment if not null
+ */
+ Node getRootNode() {
+ return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc;
+ }
+
+ /**
+ * Get the node currently being processed.
+ *
+ * @return the current node being processed
+ */
+ Node getCurrentNode() {
+ return m_currentNode;
+ }
+
+ /**
+ * Return null since there is no Writer for this class.
+ *
+ * @return null
+ */
+ java.io.Writer getWriter() {
+ return null;
+ }
+
+ /**
+ * Append a node to the current container.
+ *
+ * @param newNode
+ * New node to append
+ */
+ protected void append(Node newNode) throws org.xml.sax.SAXException {
+
+ Node currentNode = m_currentNode;
+
+ if (null != currentNode) {
+ currentNode.appendChild(newNode);
+
+ // System.out.println(newNode.getNodeName());
+ } else if (null != m_docFrag) {
+ m_docFrag.appendChild(newNode);
+ } else {
+ boolean ok = true;
+ short type = newNode.getNodeType();
+
+ if (type == Node.TEXT_NODE) {
+ String data = newNode.getNodeValue();
+
+ if ((null != data) && (data.trim().length() > 0)) {
+ throw new org.xml.sax.SAXException(
+ "Warning: can't output text before document element! Ignoring...");
+ }
+
+ ok = false;
+ } else if (type == Node.ELEMENT_NODE) {
+ if (m_doc.getDocumentElement() != null) {
+ throw new org.xml.sax.SAXException(
+ "Can't have more than one root on a DOM!");
+ }
+ }
+
+ if (ok)
+ m_doc.appendChild(newNode);
+ }
+ }
+
  /**
   * Receive an object for locating the origin of SAX document events.
   *
   * <p>
   * Supplied by the parser before any other ContentHandler callback. This
   * builder does not track source positions, so the locator is ignored.
   * </p>
   *
   * @param locator
   *          An object that can return the location of any SAX document event.
   * @see org.xml.sax.Locator
   */
  public void setDocumentLocator(Locator locator) {

    // No action for the moment.
  }
+
  /**
   * Receive notification of the beginning of a document.
   *
   * <p>
   * Invoked exactly once, before any other ContentHandler event (except
   * setDocumentLocator). The target Document already exists, so there is
   * nothing to do here.
   * </p>
   */
  public void startDocument() throws org.xml.sax.SAXException {

    // No action for the moment.
  }
+
  /**
   * Receive notification of the end of a document.
   *
   * <p>
   * Invoked exactly once, as the last event of the parse. The DOM is built
   * incrementally, so no finalization is required.
   * </p>
   */
  public void endDocument() throws org.xml.sax.SAXException {

    // No action for the moment.
  }
+
+ /**
+ * Receive notification of the beginning of an element.
+ *
+ * <p>
+ * The Parser will invoke this method at the beginning of every element in the
+ * XML document; there will be a corresponding endElement() event for every
+ * startElement() event (even when the element is empty). All of the element's
+ * content will be reported, in order, before the corresponding endElement()
+ * event.
+ * </p>
+ *
+ * <p>
+ * If the element name has a namespace prefix, the prefix will still be
+ * attached. Note that the attribute list provided will contain only
+ * attributes with explicit values (specified or defaulted): #IMPLIED
+ * attributes will be omitted.
+ * </p>
+ *
+ *
+ * @param ns
+ * The namespace of the node
+ * @param localName
+ * The local part of the qualified name
+ * @param name
+ * The element name.
+ * @param atts
+ * The attributes attached to the element, if any.
+ * @see #endElement
+ * @see org.xml.sax.Attributes
+ */
+ public void startElement(String ns, String localName, String name,
+ Attributes atts) throws org.xml.sax.SAXException {
+
+ Element elem;
+
+ if (upperCaseElementNames)
+ name = name.toUpperCase();
+
+ // Note that the namespace-aware call must be used to correctly
+ // construct a Level 2 DOM, even for non-namespaced nodes.
+ if ((null == ns) || (ns.length() == 0) || ns.equals(defaultNamespaceURI))
+ elem = m_doc.createElementNS(null, name);
+ else
+ elem = m_doc.createElementNS(ns, name);
+
+ append(elem);
+
+ try {
+ int nAtts = atts.getLength();
+
+ if (0 != nAtts) {
+ for (int i = 0; i < nAtts; i++) {
+
+ // System.out.println("type " + atts.getType(i) + " name " +
+ // atts.getLocalName(i) );
+ // First handle a possible ID attribute
+ if (atts.getType(i).equalsIgnoreCase("ID"))
+ setIDAttribute(atts.getValue(i), elem);
+
+ String attrNS = atts.getURI(i);
+
+ if ("".equals(attrNS))
+ attrNS = null; // DOM represents no-namespace as null
+
+ // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i)
+ // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i));
+ // Crimson won't let us set an xmlns: attribute on the DOM.
+ String attrQName = atts.getQName(i);
+
+ // In SAX, xmlns: attributes have an empty namespace, while in DOM
+ // they should have the xmlns namespace
+ if (attrQName.startsWith("xmlns:"))
+ attrNS = "http://www.w3.org/2000/xmlns/";
+
+ // ALWAYS use the DOM Level 2 call!
+ elem.setAttributeNS(attrNS, attrQName, atts.getValue(i));
+ }
+ }
+
+ // append(elem);
+
+ m_elemStack.push(elem);
+
+ m_currentNode = elem;
+
+ // append(elem);
+ } catch (java.lang.Exception de) {
+ // de.printStackTrace();
+ throw new org.xml.sax.SAXException(de);
+ }
+
+ }
+
+ /**
+ *
+ *
+ *
+ * Receive notification of the end of an element.
+ *
+ * <p>
+ * The SAX parser will invoke this method at the end of every element in the
+ * XML document; there will be a corresponding startElement() event for every
+ * endElement() event (even when the element is empty).
+ * </p>
+ *
+ * <p>
+ * If the element name has a namespace prefix, the prefix will still be
+ * attached to the name.
+ * </p>
+ *
+ *
+ * @param ns
+ * the namespace of the element
+ * @param localName
+ * The local part of the qualified name of the element
+ * @param name
+ * The element name
+ */
+ public void endElement(String ns, String localName, String name)
+ throws org.xml.sax.SAXException {
+ if (!m_elemStack.isEmpty()) {
+ m_elemStack.pop();
+ }
+ m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek();
+ }
+
  /**
   * Set an ID string to node association in the ID table.
   *
   * <p>
   * No-op here; subclasses that maintain an ID table are expected to override
   * this hook. Called from startElement() for attributes of type ID.
   * </p>
   *
   * @param id
   *          The ID string.
   * @param elem
   *          The associated ID.
   */
  public void setIDAttribute(String id, Element elem) {

    // Do nothing. This method is meant to be overridden.
  }
+
+ /**
+ * Receive notification of character data.
+ *
+ * <p>
+ * The Parser will call this method to report each chunk of character data.
+ * SAX parsers may return all contiguous character data in a single chunk, or
+ * they may split it into several chunks; however, all of the characters in
+ * any single event must come from the same external entity, so that the
+ * Locator provides useful information.
+ * </p>
+ *
+ * <p>
+ * The application must not attempt to read from the array outside of the
+ * specified range.
+ * </p>
+ *
+ * <p>
+ * Note that some parsers will report whitespace using the
+ * ignorableWhitespace() method rather than this one (validating parsers must
+ * do so).
+ * </p>
+ *
+ * @param ch
+ * The characters from the XML document.
+ * @param start
+ * The start position in the array.
+ * @param length
+ * The number of characters to read from the array.
+ * @see #ignorableWhitespace
+ * @see org.xml.sax.Locator
+ */
+ public void characters(char ch[], int start, int length)
+ throws org.xml.sax.SAXException {
+ if (isOutsideDocElem()
+ && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+ return; // avoid DOM006 Hierarchy request error
+
+ if (m_inCData) {
+ cdata(ch, start, length);
+
+ return;
+ }
+
+ String s = new String(ch, start, length);
+ Node childNode;
+ childNode = m_currentNode != null ? m_currentNode.getLastChild() : null;
+ if (childNode != null && childNode.getNodeType() == Node.TEXT_NODE) {
+ ((Text) childNode).appendData(s);
+ } else {
+ Text text = m_doc.createTextNode(s);
+ append(text);
+ }
+ }
+
+ /**
+ * If available, when the disable-output-escaping attribute is used, output
+ * raw text without escaping. A PI will be inserted in front of the node with
+ * the name "lotusxsl-next-is-raw" and a value of "formatter-to-dom".
+ *
+ * @param ch
+ * Array containing the characters
+ * @param start
+ * Index to start of characters in the array
+ * @param length
+ * Number of characters in the array
+ */
+ public void charactersRaw(char ch[], int start, int length)
+ throws org.xml.sax.SAXException {
+ if (isOutsideDocElem()
+ && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+ return; // avoid DOM006 Hierarchy request error
+
+ String s = new String(ch, start, length);
+
+ append(m_doc.createProcessingInstruction("xslt-next-is-raw",
+ "formatter-to-dom"));
+ append(m_doc.createTextNode(s));
+ }
+
  /**
   * Report the beginning of an entity.
   *
   * <p>
   * The start and end of the document entity are not reported, and the
   * external DTD subset uses the pseudo-name "[dtd]". Entity boundaries are
   * currently ignored by this builder.
   * </p>
   *
   * @param name
   *          The name of the entity. If it is a parameter entity, the name will
   *          begin with '%'.
   * @see #endEntity
   * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
   * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
   */
  public void startEntity(String name) throws org.xml.sax.SAXException {

    // Almost certainly the wrong behavior...
    // entityReference(name);
  }
+
  /**
   * Report the end of an entity. No-op; see {@link #startEntity}.
   *
   * @param name
   *          The name of the entity that is ending.
   * @see #startEntity
   */
  public void endEntity(String name) throws org.xml.sax.SAXException {
  }
+
+ /**
+ * Receive notivication of a entityReference.
+ *
+ * @param name
+ * name of the entity reference
+ */
+ public void entityReference(String name) throws org.xml.sax.SAXException {
+ append(m_doc.createEntityReference(name));
+ }
+
+ /**
+ * Receive notification of ignorable whitespace in element content.
+ *
+ * <p>
+ * Validating Parsers must use this method to report each chunk of ignorable
+ * whitespace (see the W3C XML 1.0 recommendation, section 2.10):
+ * non-validating parsers may also use this method if they are capable of
+ * parsing and using content models.
+ * </p>
+ *
+ * <p>
+ * SAX parsers may return all contiguous whitespace in a single chunk, or they
+ * may split it into several chunks; however, all of the characters in any
+ * single event must come from the same external entity, so that the Locator
+ * provides useful information.
+ * </p>
+ *
+ * <p>
+ * The application must not attempt to read from the array outside of the
+ * specified range.
+ * </p>
+ *
+ * @param ch
+ * The characters from the XML document.
+ * @param start
+ * The start position in the array.
+ * @param length
+ * The number of characters to read from the array.
+ * @see #characters
+ */
+ public void ignorableWhitespace(char ch[], int start, int length)
+ throws org.xml.sax.SAXException {
+ if (isOutsideDocElem())
+ return; // avoid DOM006 Hierarchy request error
+
+ String s = new String(ch, start, length);
+
+ append(m_doc.createTextNode(s));
+ }
+
+ /**
+ * Tell if the current node is outside the document element.
+ *
+ * @return true if the current node is outside the document element.
+ */
+ private boolean isOutsideDocElem() {
+ return (null == m_docFrag)
+ && m_elemStack.size() == 0
+ && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE);
+ }
+
+ /**
+ * Receive notification of a processing instruction.
+ *
+ * <p>
+ * The Parser will invoke this method once for each processing instruction
+ * found: note that processing instructions may occur before or after the main
+ * document element.
+ * </p>
+ *
+ * <p>
+ * A SAX parser should never report an XML declaration (XML 1.0, section 2.8)
+ * or a text declaration (XML 1.0, section 4.3.1) using this method.
+ * </p>
+ *
+ * @param target
+ * The processing instruction target.
+ * @param data
+ * The processing instruction data, or null if none was supplied.
+ */
+ public void processingInstruction(String target, String data)
+ throws org.xml.sax.SAXException {
+ append(m_doc.createProcessingInstruction(target, data));
+ }
+
+ /**
+ * Report an XML comment anywhere in the document.
+ *
+ * This callback will be used for comments inside or outside the document
+ * element, including comments in the external DTD subset (if read).
+ *
+ * @param ch
+ * An array holding the characters in the comment.
+ * @param start
+ * The starting position in the array.
+ * @param length
+ * The number of characters to use from the array.
+ */
+ public void comment(char ch[], int start, int length)
+ throws org.xml.sax.SAXException {
+ // tagsoup sometimes submits invalid values here
+ if (ch == null || start < 0 || length >= (ch.length - start) || length < 0)
+ return;
+ append(m_doc.createComment(new String(ch, start, length)));
+ }
+
+ /** Flag indicating that we are processing a CData section */
+ protected boolean m_inCData = false;
+
+ /**
+ * Report the start of a CDATA section.
+ *
+ * @see #endCDATA
+ */
+ public void startCDATA() throws org.xml.sax.SAXException {
+ m_inCData = true;
+ append(m_doc.createCDATASection(""));
+ }
+
+ /**
+ * Report the end of a CDATA section.
+ *
+ * @see #startCDATA
+ */
+ public void endCDATA() throws org.xml.sax.SAXException {
+ m_inCData = false;
+ }
+
+ /**
+ * Receive notification of cdata.
+ *
+ * <p>
+ * The Parser will call this method to report each chunk of character data.
+ * SAX parsers may return all contiguous character data in a single chunk, or
+ * they may split it into several chunks; however, all of the characters in
+ * any single event must come from the same external entity, so that the
+ * Locator provides useful information.
+ * </p>
+ *
+ * <p>
+ * The application must not attempt to read from the array outside of the
+ * specified range.
+ * </p>
+ *
+ * <p>
+ * Note that some parsers will report whitespace using the
+ * ignorableWhitespace() method rather than this one (validating parsers must
+ * do so).
+ * </p>
+ *
+ * @param ch
+ * The characters from the XML document.
+ * @param start
+ * The start position in the array.
+ * @param length
+ * The number of characters to read from the array.
+ * @see #ignorableWhitespace
+ * @see org.xml.sax.Locator
+ */
+ public void cdata(char ch[], int start, int length)
+ throws org.xml.sax.SAXException {
+ if (isOutsideDocElem()
+ && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+ return; // avoid DOM006 Hierarchy request error
+
+ String s = new String(ch, start, length);
+
+ // XXX ab@apache.org: modified from the original, to accomodate TagSoup.
+ Node n = m_currentNode.getLastChild();
+ if (n instanceof CDATASection)
+ ((CDATASection) n).appendData(s);
+ else if (n instanceof Comment)
+ ((Comment) n).appendData(s);
+ }
+
  /**
   * Report the start of DTD declarations, if any. DTD information is not
   * represented in the DOM built by this class, so this is a no-op.
   *
   * @param name
   *          The document type name.
   * @param publicId
   *          The declared public identifier for the external DTD subset, or
   *          null if none was declared.
   * @param systemId
   *          The declared system identifier for the external DTD subset, or
   *          null if none was declared.
   * @see #endDTD
   * @see #startEntity
   */
  public void startDTD(String name, String publicId, String systemId)
      throws org.xml.sax.SAXException {

    // Do nothing for now.
  }
+
  /**
   * Report the end of DTD declarations. No-op; see {@link #startDTD}.
   *
   * @see #startDTD
   */
  public void endDTD() throws org.xml.sax.SAXException {

    // Do nothing for now.
  }
+
  /**
   * Begin the scope of a prefix-URI Namespace mapping.
   *
   * <p>
   * Not needed for normal namespace processing (the SAX reader replaces
   * prefixes itself when the http://xml.org/sax/features/namespaces feature
   * is true, the default), so this builder ignores the event. Note that
   * start/endPrefixMapping events are not guaranteed to be properly nested
   * relative to each other: all startPrefixMapping events occur before the
   * corresponding startElement event, and all endPrefixMapping events occur
   * after the corresponding endElement event, but their order is not
   * guaranteed.
   * </p>
   *
   * @param prefix
   *          The Namespace prefix being declared.
   * @param uri
   *          The Namespace URI the prefix is mapped to.
   * @see #endPrefixMapping
   * @see #startElement
   */
  public void startPrefixMapping(String prefix, String uri)
      throws org.xml.sax.SAXException {

    /*
     * // Not sure if this is needed or wanted // Also, it fails in the stree.
     * if((null != m_currentNode) && (m_currentNode.getNodeType() ==
     * Node.ELEMENT_NODE)) { String qname; if(((null != prefix) &&
     * (prefix.length() == 0)) || (null == prefix)) qname = "xmlns"; else qname
     * = "xmlns:"+prefix;
     *
     * Element elem = (Element)m_currentNode; String val =
     * elem.getAttribute(qname); // Obsolete, should be DOM2...? if(val == null)
     * { elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", qname,
     * uri); } }
     */
  }
+
  /**
   * End the scope of a prefix-URI mapping. No-op; this event always occurs
   * after the corresponding endElement event, but the order of
   * endPrefixMapping events is not otherwise guaranteed.
   *
   * @param prefix
   *          The prefix that was being mapped.
   * @see #startPrefixMapping
   * @see #endElement
   */
  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
  }
+
  /**
   * Receive notification of a skipped entity. Skipped entities are not
   * represented in the DOM built by this class, so this is a no-op.
   *
   * @param name
   *          The name of the skipped entity. If it is a parameter entity, the
   *          name will begin with '%'.
   */
  public void skippedEntity(String name) throws org.xml.sax.SAXException {
  }
+
  /** @return whether element names are upper-cased as the DOM is built */
  public boolean isUpperCaseElementNames() {
    return upperCaseElementNames;
  }
+
  /** @param upperCaseElementNames whether to upper-case element names */
  public void setUpperCaseElementNames(boolean upperCaseElementNames) {
    this.upperCaseElementNames = upperCaseElementNames;
  }
+
  /** @return namespace URI treated as "no namespace" when creating elements */
  public String getDefaultNamespaceURI() {
    return defaultNamespaceURI;
  }
+
  /**
   * @param defaultNamespaceURI
   *          elements reported with this namespace will be created without a
   *          namespace
   */
  public void setDefaultNamespaceURI(String defaultNamespaceURI) {
    this.defaultNamespaceURI = defaultNamespaceURI;
  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java
new file mode 100644
index 0000000..5c4c990
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -0,0 +1,402 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
+import org.apache.tika.sax.Link;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * A collection of methods for extracting content from DOM trees.
+ *
+ * This class holds a few utility methods for pulling content out of DOM nodes,
+ * such as getOutlinks, getText, etc.
+ *
+ */
+public class DOMContentUtils {
+
+ private static class LinkParams {
+ private String elName;
+ private String attrName;
+ private int childLen;
+
+ private LinkParams(String elName, String attrName, int childLen) {
+ this.elName = elName;
+ this.attrName = attrName;
+ this.childLen = childLen;
+ }
+
+ public String toString() {
+ return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
+ }
+ }
+
+ private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>();
+ private HashSet<String> ignoredTags = new HashSet<String>();
+ private Configuration conf;
+
+ public DOMContentUtils(Configuration conf) {
+ setConf(conf);
+ }
+
+ public void setConf(Configuration conf) {
+ // forceTags is used to override configurable tag ignoring, later on
+ Collection<String> forceTags = new ArrayList<String>(1);
+
+ this.conf = conf;
+ linkParams.clear();
+ linkParams.put("a", new LinkParams("a", "href", 1));
+ linkParams.put("area", new LinkParams("area", "href", 0));
+ if (conf.getBoolean("parser.html.form.use_action", true)) {
+ linkParams.put("form", new LinkParams("form", "action", 1));
+ if (conf.get("parser.html.form.use_action") != null)
+ forceTags.add("form");
+ }
+ linkParams.put("frame", new LinkParams("frame", "src", 0));
+ linkParams.put("iframe", new LinkParams("iframe", "src", 0));
+ linkParams.put("script", new LinkParams("script", "src", 0));
+ linkParams.put("link", new LinkParams("link", "href", 0));
+ linkParams.put("img", new LinkParams("img", "src", 0));
+
+ // remove unwanted link tags from the linkParams map
+ String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
+ for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) {
+ ignoredTags.add(ignoreTags[i].toLowerCase());
+ if (!forceTags.contains(ignoreTags[i]))
+ linkParams.remove(ignoreTags[i]);
+ }
+ }
+
+ /**
+ * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
+ * append all the content text found beneath the DOM node to the
+ * <code>StringBuffer</code>.
+ *
+ * <p>
+ *
+ * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
+ * and the <code>StringBuffer</code> will not contain any text encountered
+ * after a nested anchor is found.
+ *
+ * <p>
+ *
+ * @return true if nested anchors were found
+ */
+ private boolean getText(StringBuffer sb, Node node,
+ boolean abortOnNestedAnchors) {
+ if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * This is a convinience method, equivalent to
+ * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
+ *
+ */
+ public void getText(StringBuffer sb, Node node) {
+ getText(sb, node, false);
+ }
+
+ // returns true if abortOnNestedAnchors is true and we find nested
+ // anchors
+ private boolean getTextHelper(StringBuffer sb, Node node,
+ boolean abortOnNestedAnchors, int anchorDepth) {
+ boolean abort = false;
+ NodeWalker walker = new NodeWalker(node);
+
+ while (walker.hasNext()) {
+
+ Node currentNode = walker.nextNode();
+ String nodeName = currentNode.getNodeName();
+ short nodeType = currentNode.getNodeType();
+
+ if ("script".equalsIgnoreCase(nodeName)) {
+ walker.skipChildren();
+ }
+ if ("style".equalsIgnoreCase(nodeName)) {
+ walker.skipChildren();
+ }
+ if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
+ anchorDepth++;
+ if (anchorDepth > 1) {
+ abort = true;
+ break;
+ }
+ }
+ if (nodeType == Node.COMMENT_NODE) {
+ walker.skipChildren();
+ }
+ if (nodeType == Node.TEXT_NODE) {
+ // cleanup and trim the value
+ String text = currentNode.getNodeValue();
+ text = text.replaceAll("\\s+", " ");
+ text = text.trim();
+ if (text.length() > 0) {
+ if (sb.length() > 0)
+ sb.append(' ');
+ sb.append(text);
+ }
+ }
+ }
+
+ return abort;
+ }
+
+ /**
+ * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
+ * append the content text found beneath the first <code>title</code> node to
+ * the <code>StringBuffer</code>.
+ *
+ * @return true if a title node was found, false otherwise
+ */
+ public boolean getTitle(StringBuffer sb, Node node) {
+
+ NodeWalker walker = new NodeWalker(node);
+
+ while (walker.hasNext()) {
+
+ Node currentNode = walker.nextNode();
+ String nodeName = currentNode.getNodeName();
+ short nodeType = currentNode.getNodeType();
+
+ if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
+ return false;
+ }
+
+ if (nodeType == Node.ELEMENT_NODE) {
+ if ("title".equalsIgnoreCase(nodeName)) {
+ getText(sb, currentNode);
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }
+
+ /** If Node contains a BASE tag then it's HREF is returned. */
+ URL getBase(Node node) {
+
+ NodeWalker walker = new NodeWalker(node);
+
+ while (walker.hasNext()) {
+
+ Node currentNode = walker.nextNode();
+ String nodeName = currentNode.getNodeName();
+ short nodeType = currentNode.getNodeType();
+
+ // is this node a BASE tag?
+ if (nodeType == Node.ELEMENT_NODE) {
+
+ if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
+ return null;
+ }
+
+ if ("base".equalsIgnoreCase(nodeName)) {
+ NamedNodeMap attrs = currentNode.getAttributes();
+ for (int i = 0; i < attrs.getLength(); i++) {
+ Node attr = attrs.item(i);
+ if ("href".equalsIgnoreCase(attr.getNodeName())) {
+ try {
+ return new URL(attr.getNodeValue());
+ } catch (MalformedURLException e) {
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // no.
+ return null;
+ }
+
+ private boolean hasOnlyWhiteSpace(Node node) {
+ String val = node.getNodeValue();
+ for (int i = 0; i < val.length(); i++) {
+ if (!Character.isWhitespace(val.charAt(i)))
+ return false;
+ }
+ return true;
+ }
+
+ // this only covers a few cases of empty links that are symptomatic
+ // of nekohtml's DOM-fixup process...
+ private boolean shouldThrowAwayLink(Node node, NodeList children,
+ int childLen, LinkParams params) {
+ if (childLen == 0) {
+ // this has no inner structure
+ if (params.childLen == 0)
+ return false;
+ else
+ return true;
+ } else if ((childLen == 1)
+ && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
+ && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
+ // single nested link
+ return true;
+
+ } else if (childLen == 2) {
+
+ Node c0 = children.item(0);
+ Node c1 = children.item(1);
+
+ if ((c0.getNodeType() == Node.ELEMENT_NODE)
+ && (params.elName.equalsIgnoreCase(c0.getNodeName()))
+ && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) {
+ // single link followed by whitespace node
+ return true;
+ }
+
+ if ((c1.getNodeType() == Node.ELEMENT_NODE)
+ && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+ && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) {
+ // whitespace node followed by single link
+ return true;
+ }
+
+ } else if (childLen == 3) {
+ Node c0 = children.item(0);
+ Node c1 = children.item(1);
+ Node c2 = children.item(2);
+
+ if ((c1.getNodeType() == Node.ELEMENT_NODE)
+ && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+ && (c0.getNodeType() == Node.TEXT_NODE)
+ && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)
+ && hasOnlyWhiteSpace(c2)) {
+ // single link surrounded by whitespace nodes
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * This method finds all anchors below the supplied DOM <code>node</code>, and
+ * creates appropriate {@link Outlink} records for each (relative to the
+ * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
+ * {@link ArrayList}.
+ *
+ * <p>
+ *
+ * Links without inner structure (tags, text, etc) are discarded, as are links
+ * which contain only single nested links and empty text nodes (this is a
+ * common DOM-fixup artifact, at least with nekohtml).
+ */
+ public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
+
+ NodeWalker walker = new NodeWalker(node);
+ while (walker.hasNext()) {
+
+ Node currentNode = walker.nextNode();
+ String nodeName = currentNode.getNodeName();
+ short nodeType = currentNode.getNodeType();
+ NodeList children = currentNode.getChildNodes();
+ int childLen = (children != null) ? children.getLength() : 0;
+
+ if (nodeType == Node.ELEMENT_NODE) {
+
+ nodeName = nodeName.toLowerCase();
+ LinkParams params = (LinkParams) linkParams.get(nodeName);
+ if (params != null) {
+ if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
+
+ StringBuffer linkText = new StringBuffer();
+ getText(linkText, currentNode, true);
+
+ NamedNodeMap attrs = currentNode.getAttributes();
+ String target = null;
+ boolean noFollow = false;
+ boolean post = false;
+ for (int i = 0; i < attrs.getLength(); i++) {
+ Node attr = attrs.item(i);
+ String attrName = attr.getNodeName();
+ if (params.attrName.equalsIgnoreCase(attrName)) {
+ target = attr.getNodeValue();
+ } else if ("rel".equalsIgnoreCase(attrName)
+ && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+ noFollow = true;
+ } else if ("method".equalsIgnoreCase(attrName)
+ && "post".equalsIgnoreCase(attr.getNodeValue())) {
+ post = true;
+ }
+ }
+ if (target != null && !noFollow && !post)
+ try {
+
+ URL url = URLUtil.resolveURL(base, target);
+ outlinks.add(new Outlink(url.toString(), linkText.toString()
+ .trim()));
+ } catch (MalformedURLException e) {
+ // don't care
+ }
+ }
+ // this should not have any children, skip them
+ if (params.childLen == 0)
+ continue;
+ }
+ }
+ }
+ }
+
+ // This one is used by NUTCH-1918
+ public void getOutlinks(URL base, ArrayList<Outlink> outlinks, List<Link> tikaExtractedOutlinks) {
+ String target = null;
+ String anchor = null;
+ boolean noFollow = false;
+
+ for (Link link : tikaExtractedOutlinks) {
+ target = link.getUri();
+ noFollow = (link.getRel().toLowerCase().equals("nofollow")) ? true : false;
+ anchor = link.getText();
+
+ if (!ignoredTags.contains(link.getType())) {
+ if (target != null && !noFollow) {
+ try {
+ URL url = URLUtil.resolveURL(base, target);
+
+ // clean the anchor
+ anchor = anchor.replaceAll("\\s+", " ");
+ anchor = anchor.trim();
+
+ outlinks.add(new Outlink(url.toString(), anchor));
+ } catch (MalformedURLException e) {
+ // don't care
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
new file mode 100644
index 0000000..294bde9
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
@@ -0,0 +1,214 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import java.net.URL;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.w3c.dom.*;
+
+/**
+ * Class for parsing META Directives from DOM trees. This class handles
+ * specifically Robots META directives (all, none, nofollow, noindex), finding
+ * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are
+ * stored in a HTMLMetaTags instance.
+ */
+public class HTMLMetaProcessor {
+
+ /**
+ * Utility class with indicators for the robots directives "noindex" and
+ * "nofollow", and HTTP-EQUIV/no-cache
+ */
+
+ /**
+ * Sets the indicators in <code>robotsMeta</code> to appropriate values, based
+ * on any META tags found under the given <code>node</code>.
+ */
+ public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
+ URL currURL) {
+
+ metaTags.reset();
+ getMetaTagsHelper(metaTags, node, currURL);
+ }
+
+ private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
+ URL currURL) {
+
+ if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+ if ("body".equalsIgnoreCase(node.getNodeName())) {
+ // META tags should not be under body
+ return;
+ }
+
+ if ("meta".equalsIgnoreCase(node.getNodeName())) {
+ NamedNodeMap attrs = node.getAttributes();
+ Node nameNode = null;
+ Node equivNode = null;
+ Node contentNode = null;
+ // Retrieves name, http-equiv and content attribues
+ for (int i = 0; i < attrs.getLength(); i++) {
+ Node attr = attrs.item(i);
+ String attrName = attr.getNodeName().toLowerCase();
+ if (attrName.equals("name")) {
+ nameNode = attr;
+ } else if (attrName.equals("http-equiv")) {
+ equivNode = attr;
+ } else if (attrName.equals("content")) {
+ contentNode = attr;
+ }
+ }
+
+ if (nameNode != null) {
+ if (contentNode != null) {
+ String name = nameNode.getNodeValue().toLowerCase();
+ metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
+ if ("robots".equals(name)) {
+
+ if (contentNode != null) {
+ String directives = contentNode.getNodeValue().toLowerCase();
+ int index = directives.indexOf("none");
+
+ if (index >= 0) {
+ metaTags.setNoIndex();
+ metaTags.setNoFollow();
+ }
+
+ index = directives.indexOf("all");
+ if (index >= 0) {
+ // do nothing...
+ }
+
+ index = directives.indexOf("noindex");
+ if (index >= 0) {
+ metaTags.setNoIndex();
+ }
+
+ index = directives.indexOf("nofollow");
+ if (index >= 0) {
+ metaTags.setNoFollow();
+ }
+
+ index = directives.indexOf("noarchive");
+ if (index >= 0) {
+ metaTags.setNoCache();
+ }
+ }
+
+ } // end if (name == robots)
+ }
+ }
+
+ if (equivNode != null) {
+ if (contentNode != null) {
+ String name = equivNode.getNodeValue().toLowerCase();
+ String content = contentNode.getNodeValue();
+ metaTags.getHttpEquivTags().setProperty(name, content);
+ if ("pragma".equals(name)) {
+ content = content.toLowerCase();
+ int index = content.indexOf("no-cache");
+ if (index >= 0)
+ metaTags.setNoCache();
+ } else if ("refresh".equals(name)) {
+ int idx = content.indexOf(';');
+ String time = null;
+ if (idx == -1) { // just the refresh time
+ time = content;
+ } else
+ time = content.substring(0, idx);
+ try {
+ metaTags.setRefreshTime(Integer.parseInt(time));
+ // skip this if we couldn't parse the time
+ metaTags.setRefresh(true);
+ } catch (Exception e) {
+ ;
+ }
+ URL refreshUrl = null;
+ if (metaTags.getRefresh() && idx != -1) { // set the URL
+ idx = content.toLowerCase().indexOf("url=");
+ if (idx == -1) { // assume a mis-formatted entry with just the
+ // url
+ idx = content.indexOf(';') + 1;
+ } else
+ idx += 4;
+ if (idx != -1) {
+ String url = content.substring(idx);
+ try {
+ refreshUrl = new URL(url);
+ } catch (Exception e) {
+ // XXX according to the spec, this has to be an absolute
+ // XXX url. However, many websites use relative URLs and
+ // XXX expect browsers to handle that.
+ // XXX Unfortunately, in some cases this may create a
+ // XXX infinitely recursive paths (a crawler trap)...
+ // if (!url.startsWith("/")) url = "/" + url;
+ try {
+ refreshUrl = new URL(currURL, url);
+ } catch (Exception e1) {
+ refreshUrl = null;
+ }
+ }
+ }
+ }
+ if (metaTags.getRefresh()) {
+ if (refreshUrl == null) {
+ // apparently only refresh time was present. set the URL
+ // to the same URL.
+ refreshUrl = currURL;
+ }
+ metaTags.setRefreshHref(refreshUrl);
+ }
+ }
+ }
+ }
+
+ } else if ("base".equalsIgnoreCase(node.getNodeName())) {
+ NamedNodeMap attrs = node.getAttributes();
+ Node hrefNode = attrs.getNamedItem("href");
+
+ if (hrefNode != null) {
+ String urlString = hrefNode.getNodeValue();
+
+ URL url = null;
+ try {
+ if (currURL == null)
+ url = new URL(urlString);
+ else
+ url = new URL(currURL, urlString);
+ } catch (Exception e) {
+ ;
+ }
+
+ if (url != null)
+ metaTags.setBaseHref(url);
+ }
+
+ }
+
+ }
+
+ NodeList children = node.getChildNodes();
+ if (children != null) {
+ int len = children.getLength();
+ for (int i = 0; i < len; i++) {
+ getMetaTagsHelper(metaTags, children.item(i), currURL);
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java
new file mode 100644
index 0000000..5d7eca9
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -0,0 +1,286 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import java.io.ByteArrayInputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.html.dom.HTMLDocumentImpl;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilters;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.protocol.Content;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlMapper;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.sax.Link;
+import org.apache.tika.sax.LinkContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Wrapper for Tika parsers. Mimics the HTMLParser but using the XHTML
+ * representation returned by Tika as SAX events
+ ***/
+
+public class TikaParser implements org.apache.nutch.parse.Parser {
+
+ public static final Logger LOG = LoggerFactory.getLogger(TikaParser.class);
+
+ private Configuration conf;
+ private TikaConfig tikaConfig = null;
+ private DOMContentUtils utils;
+ private HtmlParseFilters htmlParseFilters;
+ private String cachingPolicy;
+ private HtmlMapper HTMLMapper;
+ private boolean upperCaseElementNames = true;
+
+ @SuppressWarnings("deprecation")
+ public ParseResult getParse(Content content) {
+ String mimeType = content.getContentType();
+
+ boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
+ String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
+
+ URL base;
+ try {
+ base = new URL(content.getBaseUrl());
+ } catch (MalformedURLException e) {
+ return new ParseStatus(e)
+ .getEmptyParseResult(content.getUrl(), getConf());
+ }
+
+ // get the right parser using the mime type as a clue
+ Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
+ byte[] raw = content.getContent();
+
+ if (parser == null) {
+ String message = "Can't retrieve Tika parser for mime-type " + mimeType;
+ LOG.error(message);
+ return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(
+ content.getUrl(), getConf());
+ }
+
+ LOG.debug("Using Tika parser " + parser.getClass().getName()
+ + " for mime-type " + mimeType);
+
+ Metadata tikamd = new Metadata();
+
+ HTMLDocumentImpl doc = new HTMLDocumentImpl();
+ doc.setErrorChecking(false);
+ DocumentFragment root = doc.createDocumentFragment();
+
+ ContentHandler domHandler;
+
+ // Check whether to use Tika's BoilerplateContentHandler
+ if (useBoilerpipe) {
+ BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root),
+ BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
+ bpHandler.setIncludeMarkup(true);
+ domHandler = (ContentHandler)bpHandler;
+ } else {
+ DOMBuilder domBuilder = new DOMBuilder(doc, root);
+ domBuilder.setUpperCaseElementNames(upperCaseElementNames);
+ domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
+ domHandler = (ContentHandler)domBuilder;
+ }
+
+ LinkContentHandler linkContentHandler = new LinkContentHandler();
+
+ ParseContext context = new ParseContext();
+ TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
+
+ if (HTMLMapper != null)
+ context.set(HtmlMapper.class, HTMLMapper);
+ tikamd.set(Metadata.CONTENT_TYPE, mimeType);
+ try {
+ parser.parse(new ByteArrayInputStream(raw), (ContentHandler)teeContentHandler, tikamd, context);
+ } catch (Exception e) {
+ LOG.error("Error parsing " + content.getUrl(), e);
+ return new ParseStatus(ParseStatus.FAILED, e.getMessage())
+ .getEmptyParseResult(content.getUrl(), getConf());
+ }
+
+ HTMLMetaTags metaTags = new HTMLMetaTags();
+ String text = "";
+ String title = "";
+ Outlink[] outlinks = new Outlink[0];
+ org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();
+
+ // we have converted the sax events generated by Tika into a DOM object
+ // so we can now use the usual HTML resources from Nutch
+ // get meta directives
+ HTMLMetaProcessor.getMetaTags(metaTags, root, base);
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
+ }
+
+ // check meta directives
+ if (!metaTags.getNoIndex()) { // okay to index
+ StringBuffer sb = new StringBuffer();
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Getting text...");
+ }
+ utils.getText(sb, root); // extract text
+ text = sb.toString();
+ sb.setLength(0);
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Getting title...");
+ }
+ utils.getTitle(sb, root); // extract title
+ title = sb.toString().trim();
+ }
+
+ if (!metaTags.getNoFollow()) { // okay to follow links
+ ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
+ URL baseTag = utils.getBase(root);
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Getting links...");
+ }
+
+ // pre-1233 outlink extraction
+ //utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
+ // Get outlinks from Tika
+ List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
+ utils.getOutlinks(baseTag != null ? baseTag : base, l, tikaExtractedOutlinks);
+ outlinks = l.toArray(new Outlink[l.size()]);
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("found " + outlinks.length + " outlinks in "
+ + content.getUrl());
+ }
+ }
+
+ // populate Nutch metadata with Tika metadata
+ String[] TikaMDNames = tikamd.names();
+ for (String tikaMDName : TikaMDNames) {
+ if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
+ continue;
+ String[] values = tikamd.getValues(tikaMDName);
+ for (String v : values)
+ nutchMetadata.add(tikaMDName, v);
+ }
+
+ // no outlinks? try OutlinkExtractor e.g works for mime types where no
+ // explicit markup for anchors
+
+ if (outlinks.length == 0) {
+ outlinks = OutlinkExtractor.getOutlinks(text, getConf());
+ }
+
+ ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
+ if (metaTags.getRefresh()) {
+ status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
+ status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
+ Integer.toString(metaTags.getRefreshTime()) });
+ }
+ ParseData parseData = new ParseData(status, title, outlinks,
+ content.getMetadata(), nutchMetadata);
+ ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
+ new ParseImpl(text, parseData));
+
+ // run filters on parse
+ ParseResult filteredParse = this.htmlParseFilters.filter(content,
+ parseResult, metaTags, root);
+ if (metaTags.getNoCache()) { // not okay to cache
+ for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
+ entry.getValue().getData().getParseMeta()
+ .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
+ }
+ return filteredParse;
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ this.tikaConfig = null;
+
+ // do we want a custom Tika configuration file
+ // deprecated since Tika 0.7 which is based on
+ // a service provider based configuration
+ String customConfFile = conf.get("tika.config.file");
+ if (customConfFile != null) {
+ try {
+ // see if a Tika config file can be found in the job file
+ URL customTikaConfig = conf.getResource(customConfFile);
+ if (customTikaConfig != null)
+ tikaConfig = new TikaConfig(customTikaConfig);
+ } catch (Exception e1) {
+ String message = "Problem loading custom Tika configuration from "
+ + customConfFile;
+ LOG.error(message, e1);
+ }
+ } else {
+ try {
+ tikaConfig = new TikaConfig(this.getClass().getClassLoader());
+ } catch (Exception e2) {
+ String message = "Problem loading default Tika configuration";
+ LOG.error(message, e2);
+ }
+ }
+
+ // use a custom htmlmapper
+ String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
+ if (StringUtils.isNotBlank(htmlmapperClassName)) {
+ try {
+ Class HTMLMapperClass = Class.forName(htmlmapperClassName);
+ boolean interfaceOK = HtmlMapper.class
+ .isAssignableFrom(HTMLMapperClass);
+ if (!interfaceOK) {
+ throw new RuntimeException("Class " + htmlmapperClassName
+ + " does not implement HtmlMapper");
+ }
+ HTMLMapper = (HtmlMapper) HTMLMapperClass.newInstance();
+ } catch (Exception e) {
+ LOG.error("Can't generate instance for class " + htmlmapperClassName);
+ throw new RuntimeException("Can't generate instance for class "
+ + htmlmapperClassName);
+ }
+ }
+
+ this.htmlParseFilters = new HtmlParseFilters(getConf());
+ this.utils = new DOMContentUtils(conf);
+ this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+ Nutch.CACHING_FORBIDDEN_CONTENT);
+ this.upperCaseElementNames = getConf().getBoolean(
+ "tika.uppercase.element.names", true);
+ }
+
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
new file mode 100644
index 0000000..d625c33
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
@@ -0,0 +1,112 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer,
+ * XXX in order to avoid dependency on Xalan.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id: XMLCharacterRecognizer.java 823614 2009-10-09 17:02:32Z ab $
+ */
+package org.apache.nutch.parse.tika;
+
/**
 * Utility for testing characters against the XML 1.0 whitespace production.
 * A character is XML whitespace iff it is one of space, tab, carriage return
 * or line feed.
 */
class XMLCharacterRecognizer {

  /**
   * Returns whether the specified <var>ch</var> conforms to the XML 1.0
   * definition of whitespace. Refer to <A
   * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of
   * <CODE>S</CODE></A> for details.
   *
   * @param ch
   *          Character to check as XML whitespace.
   * @return =true if <var>ch</var> is XML whitespace; otherwise =false.
   */
  static boolean isWhiteSpace(char ch) {
    switch (ch) {
    case ' ':  // 0x20
    case '\t': // 0x09
    case '\r': // 0x0D
    case '\n': // 0x0A
      return true;
    default:
      return false;
    }
  }

  /**
   * Tell if a region of a character array is entirely XML whitespace.
   *
   * @param ch
   *          Character array to check as XML whitespace.
   * @param start
   *          Start index of characters in the array
   * @param length
   *          Number of characters in the array
   * @return True if the characters in the array are XML whitespace; otherwise,
   *         false.
   */
  static boolean isWhiteSpace(char ch[], int start, int length) {
    int limit = start + length;
    int pos = start;
    while (pos < limit) {
      if (!isWhiteSpace(ch[pos]))
        return false;
      pos++;
    }
    return true;
  }

  /**
   * Tell if the buffer contains only XML whitespace.
   *
   * @param buf
   *          StringBuffer to check as XML whitespace.
   * @return True if characters in buffer are XML whitespace, false otherwise
   */
  static boolean isWhiteSpace(StringBuffer buf) {
    for (int i = 0, n = buf.length(); i < n; i++) {
      if (!isWhiteSpace(buf.charAt(i)))
        return false;
    }
    return true;
  }

  /**
   * Tell if the string contains only XML whitespace.
   *
   * @param s
   *          String to check as XML whitespace; may be null.
   * @return True if characters in the string are XML whitespace (or the
   *         string is null or empty), false otherwise
   */
  static boolean isWhiteSpace(String s) {
    if (s == null)
      return true; // matches the Xalan original: null counts as whitespace
    for (int i = 0, n = s.length(); i < n; i++) {
      if (!isWhiteSpace(s.charAt(i)))
        return false;
    }
    return true;
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/package-info.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/package-info.java
new file mode 100644
index 0000000..19e3f47
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse various document formats with help of
+ * <a href="http://tika.apache.org/">Apache Tika</a>.
+ */
+package org.apache.nutch.parse.tika;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
new file mode 100644
index 0000000..96029a6
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
@@ -0,0 +1,337 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.tika.DOMContentUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Unit tests for DOMContentUtils.
+ */
+public class TestDOMContentUtils {
+
+ private static final String[] testPages = {
+
+ new String("<html><head><title> title </title><script> script </script>"
+ + "</head><body> body <a href=\"http://www.nutch.org\">"
+ + " anchor </a><!--comment-->" + "</body></html>"),
+
+ new String("<html><head><title> title </title><script> script </script>"
+ + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+ + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+ + "</body></html>"),
+
+ new String("<html><head><title> </title>" + "</head><body> "
+ + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+ + "</a></a>" + "</body></html>"),
+
+ // this one relies on certain neko fixup behavior, possibly
+ // distributing the anchors into the LI's-but not the other
+ // anchors (outside of them, instead)! So you get a tree that
+ // looks like:
+ // ... <li> <a href=/> home </a> </li>
+ // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+ // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+ new String("<html><head><title> my title </title>"
+ + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+ + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+ + "</body></html>"),
+
+ // test frameset link extraction. The invalid frame in the middle
+ // will be
+ // fixed to a third standalone frame.
+ new String("<html><head><title> my title </title>"
+ + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+ + "</frame>" + "<frameset cols=\"20,*\">"
+ + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
+ + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
+ + "</frameset>" + "</frameset>" + "</body></html>"),
+
+ // test <area> and <iframe> link extraction + url normalization
+ new String(
+ "<html><head><title> my title </title>"
+ + "</head><body>"
+ + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+ + "<map name=\"green\">"
+ + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+ + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+ + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+ + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+ + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+
+ // test whitespace processing for plain text extraction
+ new String(
+ "<html><head>\n <title> my\t\n title\r\n </title>\n"
+ + " </head>\n"
+ + " <body>\n"
+ + " <h1> Whitespace\ttest </h1> \n"
+ + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
+ + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+ + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+ + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
+ + "<table>"
+ + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+ + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+ + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+ + "</table>put some text here<Br>and there."
+ + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+ + " . . . ." + "</body> </html>"),
+
+ // test that <a rel=nofollow> links are not returned
+ new String("<html><head></head><body>"
+ + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+ + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+ + "</body></html>"),
+ // test that POST form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ // test that all form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ new String("<html><head><title> title </title>" + "</head><body>"
+ + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+ + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+ new String("<html><head><title> title </title>" + "</head><body>"
+ + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+ + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+ + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), };
+
+ private static int SKIP = 9;
+
+ private static String[] testBaseHrefs = { "http://www.nutch.org",
+ "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+ "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+ "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+ "http://www.nutch.org//", "http://www.nutch.org/",
+ "http://www.nutch.org/", "http://www.nutch.org/",
+ "http://www.nutch.org/;something" };
+
+ private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+ private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+ private static final String[] answerText = {
+ "title body anchor",
+ "title body home bots",
+ "separate this from this",
+ "my title body home 1 2",
+ "my title",
+ "my title the bottom",
+ "my title Whitespace test whitespace test "
+ + "This is a whitespace test . Newlines should appear as space too. "
+ + "Tabs are spaces too. This is a break -> and the line after break . "
+ + "one two three space here space there no space "
+ + "one two two three three four put some text here and there. "
+ + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+ "test1 test2", "title anchor1 anchor2 anchor3",
+ "title anchor1 anchor2 anchor3 anchor4 anchor5" };
+
+ private static final String[] answerTitle = { "title", "title", "",
+ "my title", "my title", "my title", "my title", "", "", "", "title",
+ "title" };
+
+ // note: should be in page-order
+ private static Outlink[][] answerOutlinks;
+
+ private static Configuration conf;
+ private static DOMContentUtils utils = null;
+
+ @Before
+ public void setup() throws Exception {
+ conf = NutchConfiguration.create();
+ conf.setBoolean("parser.html.form.use_action", true);
+ utils = new DOMContentUtils(conf);
+ DOMFragmentParser parser = new DOMFragmentParser();
+ parser.setFeature(
+ "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+ true);
+ for (int i = 0; i < testPages.length; i++) {
+ DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+ try {
+ parser.parse(
+ new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
+ node);
+ testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+ } catch (Exception e) {
+ Assert.assertTrue("caught exception: " + e, false);
+ }
+ testDOMs[i] = node;
+ }
+ answerOutlinks = new Outlink[][] {
+ { new Outlink("http://www.nutch.org", "anchor"), },
+ { new Outlink("http://www.nutch.org/", "home"),
+ new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+ { new Outlink("http://www.nutch.org/", "separate this"),
+ new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+ { new Outlink("http://www.nutch.org/", "home"),
+ new Outlink("http://www.nutch.org/docs/1", "1"),
+ new Outlink("http://www.nutch.org/docs/2", "2"), },
+ { new Outlink("http://www.nutch.org/frames/top.html", ""),
+ new Outlink("http://www.nutch.org/frames/left.html", ""),
+ new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+ new Outlink("http://www.nutch.org/frames/right.html", ""), },
+ { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+ new Outlink("http://www.nutch.org/index.html", ""),
+ new Outlink("http://www.nutch.org/maps/#bottom", ""),
+ new Outlink("http://www.nutch.org/bot.html", ""),
+ new Outlink("http://www.nutch.org/docs/index.html", ""), },
+ { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+ {},
+ { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+ {},
+ { new Outlink("http://www.nutch.org/;x", "anchor1"),
+ new Outlink("http://www.nutch.org/g;x", "anchor2"),
+ new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+ {
+ // this is tricky - see RFC3986 section 5.4.1 example 7
+ new Outlink("http://www.nutch.org/g", "anchor1"),
+ new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+ new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+ new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+ new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+ "anchor5") } };
+
+ }
+
+ private static boolean equalsIgnoreWhitespace(String s1, String s2) {
+ StringTokenizer st1 = new StringTokenizer(s1);
+ StringTokenizer st2 = new StringTokenizer(s2);
+
+ while (st1.hasMoreTokens()) {
+ if (!st2.hasMoreTokens())
+ return false;
+ if (!st1.nextToken().equals(st2.nextToken()))
+ return false;
+ }
+ if (st2.hasMoreTokens())
+ return false;
+ return true;
+ }
+
+ @Test
+ public void testGetText() throws Exception {
+ if (testDOMs[0] == null)
+ setup();
+ for (int i = 0; i < testPages.length; i++) {
+ StringBuffer sb = new StringBuffer();
+ utils.getText(sb, testDOMs[i]);
+ String text = sb.toString();
+ Assert.assertTrue(
+ "expecting text: " + answerText[i]
+ + System.getProperty("line.separator")
+ + System.getProperty("line.separator") + "got text: " + text,
+ equalsIgnoreWhitespace(answerText[i], text));
+ }
+ }
+
+ @Test
+ public void testGetTitle() throws Exception {
+ if (testDOMs[0] == null)
+ setup();
+ for (int i = 0; i < testPages.length; i++) {
+ StringBuffer sb = new StringBuffer();
+ utils.getTitle(sb, testDOMs[i]);
+ String text = sb.toString();
+ Assert.assertTrue(
+ "expecting text: " + answerText[i]
+ + System.getProperty("line.separator")
+ + System.getProperty("line.separator") + "got text: " + text,
+ equalsIgnoreWhitespace(answerTitle[i], text));
+ }
+ }
+
+ @Test
+ public void testGetOutlinks() throws Exception {
+ if (testDOMs[0] == null)
+ setup();
+ for (int i = 0; i < testPages.length; i++) {
+ ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
+ if (i == SKIP) {
+ conf.setBoolean("parser.html.form.use_action", false);
+ utils.setConf(conf);
+ } else {
+ conf.setBoolean("parser.html.form.use_action", true);
+ utils.setConf(conf);
+ }
+ utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
+ Outlink[] outlinkArr = new Outlink[outlinks.size()];
+ outlinkArr = outlinks.toArray(outlinkArr);
+ compareOutlinks(answerOutlinks[i], outlinkArr);
+ }
+ }
+
+ private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
+ for (int i = 0; i < o.length; i++) {
+ sb.append(o[i].toString());
+ sb.append(System.getProperty("line.separator"));
+ }
+ }
+
+ private static final String outlinksString(Outlink[] o) {
+ StringBuffer sb = new StringBuffer();
+ appendOutlinks(sb, o);
+ return sb.toString();
+ }
+
+ private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
+ if (o1.length != o2.length) {
+ Assert.assertTrue(
+ "got wrong number of outlinks (expecting " + o1.length + ", got "
+ + o2.length + ")" + System.getProperty("line.separator")
+ + "answer: " + System.getProperty("line.separator")
+ + outlinksString(o1) + System.getProperty("line.separator")
+ + "got: " + System.getProperty("line.separator")
+ + outlinksString(o2) + System.getProperty("line.separator"),
+ false);
+ }
+
+ for (int i = 0; i < o1.length; i++) {
+ if (!o1[i].equals(o2[i])) {
+ Assert.assertTrue(
+ "got wrong outlinks at position " + i
+ + System.getProperty("line.separator") + "answer: "
+ + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
+ + "', anchor: '" + o1[i].getAnchor() + "'"
+ + System.getProperty("line.separator") + "got: "
+ + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
+ + "', anchor: '" + o2[i].getAnchor() + "'", false);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
new file mode 100644
index 0000000..c9394dc
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.tika.TikaParser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ *
+ * @author mattmann / jnioche
+ *
+ * Test Suite for the RSS feeds with the {@link TikaParser}.
+ *
+ */
+public class TestFeedParser {
+
+ private String fileSeparator = System.getProperty("file.separator");
+
+ // This system property is defined in ./src/plugin/build-plugin.xml
+ private String sampleDir = System.getProperty("test.data", ".");
+
+ private String[] sampleFiles = { "rsstest.rss" };
+
+ public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
+ .getName());
+
+ /**
+ * <p>
+ * The test method: tests out the following 2 asserts:
+ * </p>
+ *
+ * <ul>
+ * <li>There are 3 outlinks read from the sample rss file</li>
+ * <li>The 3 outlinks read are in fact the correct outlinks from the sample
+ * file</li>
+ * </ul>
+ */
+ @Test
+ public void testIt() throws ProtocolException, ParseException {
+ String urlString;
+ Protocol protocol;
+ Content content;
+ Parse parse;
+
+ Configuration conf = NutchConfiguration.create();
+ for (int i = 0; i < sampleFiles.length; i++) {
+ urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+ protocol = new ProtocolFactory(conf).getProtocol(urlString);
+ content = protocol.getProtocolOutput(new Text(urlString),
+ new CrawlDatum()).getContent();
+ parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+ .get(content.getUrl());
+
+ // check that there are 2 outlinks:
+ // unlike the original parse-rss
+ // tika ignores the URL and description of the channel
+
+ // http://test.channel.com
+ // http://www-scf.usc.edu/~mattmann/
+ // http://www.nutch.org
+
+ ParseData theParseData = parse.getData();
+
+ Outlink[] theOutlinks = theParseData.getOutlinks();
+
+ Assert.assertTrue("There aren't 2 outlinks read!",
+ theOutlinks.length == 2);
+
+ // now check to make sure that those are the two outlinks
+ boolean hasLink1 = false, hasLink2 = false;
+
+ for (int j = 0; j < theOutlinks.length; j++) {
+ if (theOutlinks[j].getToUrl().equals(
+ "http://www-scf.usc.edu/~mattmann/")) {
+ hasLink1 = true;
+ }
+
+ if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
+ hasLink2 = true;
+ }
+ }
+
+ if (!hasLink1 || !hasLink2) {
+ Assert.fail("Outlinks read from sample rss file are not correct!");
+ }
+ }
+ }
+
+}