You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:55 UTC
[11/69] [abbrv] [partial] nutch git commit: Re arranged the source
code as per maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
deleted file mode 100644
index 6a1038b..0000000
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
+++ /dev/null
@@ -1,766 +0,0 @@
-/*
- * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
- * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
- * avoid dependency on Xalan.
- */
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
- * $Id$
- */
-package org.apache.nutch.parse.html;
-
-import java.util.Stack;
-
-import org.w3c.dom.Comment;
-import org.w3c.dom.Document;
-import org.w3c.dom.DocumentFragment;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.Text;
-import org.w3c.dom.CDATASection;
-
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.Locator;
-import org.xml.sax.ext.LexicalHandler;
-
-/**
- * This class takes SAX events (in addition to some extra events that SAX
- * doesn't handle yet) and adds the result to a document or document fragment.
- */
-public class DOMBuilder implements ContentHandler, LexicalHandler {
-
- /**
-  * Root document: factory for every node created by this builder, and the
-  * append target when there is neither a current node nor a fragment.
-  */
- public Document m_doc;
-
- /**
-  * Node that receives newly appended children. Set to the new element by
-  * startElement and reset by endElement to the enclosing open element (or null
-  * once the element stack is empty).
-  */
- protected Node m_currentNode = null;
-
- /** Target document fragment, or null if not building into a DocumentFragment */
- public DocumentFragment m_docFrag = null;
-
- /** Stack of currently open elements (pushed in startElement, popped in endElement) */
- protected Stack<Element> m_elemStack = new Stack<Element>();
-
- /**
- * DOMBuilder instance constructor... it will add the DOM nodes to the
- * given node.
- *
- * @param doc
- * Root document
- * @param node
- * Current node
- */
- public DOMBuilder(Document doc, Node node) {
-   // doc creates the nodes; node is where the first ones get appended.
-   this.m_doc = doc;
-   this.m_currentNode = node;
- }
-
- /**
- * DOMBuilder instance constructor... it will add the DOM nodes to the
- * document fragment.
- *
- * @param doc
- * Root document
- * @param docFrag
- * Document fragment
- */
- public DOMBuilder(Document doc, DocumentFragment docFrag) {
-   // doc creates the nodes; top-level nodes are appended to docFrag.
-   this.m_doc = doc;
-   this.m_docFrag = docFrag;
- }
-
- /**
- * DOMBuilder instance constructor... it will add the DOM nodes to the
- * document.
- *
- * @param doc
- * Root document
- */
- public DOMBuilder(Document doc) {
-   // No current node and no fragment: nodes go straight into the document.
-   this.m_doc = doc;
- }
-
- /**
- * Get the root node of the DOM being created. This is either a Document or a
- * DocumentFragment.
- *
- * @return The root document or document fragment if not null
- */
- public Node getRootNode() {
-   // A fragment, when one was supplied, takes precedence over the document.
-   if (m_docFrag != null) {
-     return m_docFrag;
-   }
-   return m_doc;
- }
-
- /**
- * Get the node currently being processed.
- *
- * @return the current node being processed
- */
- public Node getCurrentNode() {
-   // May be null when no element is open.
-   return this.m_currentNode;
- }
-
- /**
- * Return null since there is no Writer for this class.
- *
- * @return null
- */
- public java.io.Writer getWriter() {
-   // This builder produces DOM nodes, not character output.
-   return null;
- }
-
- /**
- * Append a node to the current container.
- *
- * @param newNode
- * New node to append
- */
- protected void append(Node newNode) throws org.xml.sax.SAXException {
-
-   // Preferred targets, in order: the open node, then the fragment.
-   Node target = m_currentNode;
-   if (target != null) {
-     target.appendChild(newNode);
-     return;
-   }
-   if (m_docFrag != null) {
-     m_docFrag.appendChild(newNode);
-     return;
-   }
-
-   // Otherwise the node would become a direct child of the document, which
-   // accepts neither text nor a second root element.
-   switch (newNode.getNodeType()) {
-   case Node.TEXT_NODE: {
-     String content = newNode.getNodeValue();
-     if (content != null && content.trim().length() > 0) {
-       throw new org.xml.sax.SAXException(
-           "Warning: can't output text before document element! Ignoring...");
-     }
-     // Whitespace-only (or null) text is silently dropped.
-     return;
-   }
-   case Node.ELEMENT_NODE:
-     if (m_doc.getDocumentElement() != null) {
-       throw new org.xml.sax.SAXException(
-           "Can't have more than one root on a DOM!");
-     }
-     break;
-   default:
-     break;
-   }
-
-   m_doc.appendChild(newNode);
- }
-
- /**
- * Receive an object for locating the origin of SAX document events.
- *
- * <p>
- * SAX parsers are strongly encouraged (though not absolutely required) to
- * supply a locator: if it does so, it must supply the locator to the
- * application by invoking this method before invoking any of the other
- * methods in the ContentHandler interface.
- * </p>
- *
- * <p>
- * The locator allows the application to determine the end position of any
- * document-related event, even if the parser is not reporting an error.
- * Typically, the application will use this information for reporting its own
- * errors (such as character content that does not match an application's
- * business rules). The information returned by the locator is probably not
- * sufficient for use with a search engine.
- * </p>
- *
- * <p>
- * Note that the locator will return correct information only during the
- * invocation of the events in this interface. The application should not
- * attempt to use it at any other time.
- * </p>
- *
- * @param locator
- * An object that can return the location of any SAX document event.
- * @see org.xml.sax.Locator
- */
- public void setDocumentLocator(Locator locator) {
-
- // Intentionally empty: the locator is neither stored nor used here.
- }
-
- /**
- * Receive notification of the beginning of a document.
- *
- * <p>
- * The SAX parser will invoke this method only once, before any other methods
- * in this interface or in DTDHandler (except for setDocumentLocator).
- * </p>
- */
- public void startDocument() throws org.xml.sax.SAXException {
-
- // Intentionally empty: nothing is set up when the document starts.
- }
-
- /**
- * Receive notification of the end of a document.
- *
- * <p>
- * The SAX parser will invoke this method only once, and it will be the last
- * method invoked during the parse. The parser shall not invoke this method
- * until it has either abandoned parsing (because of an unrecoverable error)
- * or reached the end of input.
- * </p>
- */
- public void endDocument() throws org.xml.sax.SAXException {
-
- // Intentionally empty: nothing is finalized when the document ends.
- }
-
- /**
- * Receive notification of the beginning of an element.
- *
- * <p>
- * The Parser will invoke this method at the beginning of every element in the
- * XML document; there will be a corresponding endElement() event for every
- * startElement() event (even when the element is empty). All of the element's
- * content will be reported, in order, before the corresponding endElement()
- * event.
- * </p>
- *
- * <p>
- * If the element name has a namespace prefix, the prefix will still be
- * attached. Note that the attribute list provided will contain only
- * attributes with explicit values (specified or defaulted): #IMPLIED
- * attributes will be omitted.
- * </p>
- *
- *
- * @param ns
- * The namespace of the node
- * @param localName
- * The local part of the qualified name
- * @param name
- * The element name.
- * @param atts
- * The attributes attached to the element, if any.
- * @see #endElement
- * @see org.xml.sax.Attributes
- */
- public void startElement(String ns, String localName, String name,
-     Attributes atts) throws org.xml.sax.SAXException {
-
-   // Note that the namespace-aware call must be used to correctly
-   // construct a Level 2 DOM, even for non-namespaced nodes.
-   // An empty namespace from SAX is represented as null in the DOM.
-   String elemNS = ((null == ns) || (ns.length() == 0)) ? null : ns;
-   Element elem = m_doc.createElementNS(elemNS, name);
-
-   // The element is attached to the tree before its attributes are copied.
-   append(elem);
-
-   try {
-     int nAtts = atts.getLength();
-
-     for (int i = 0; i < nAtts; i++) {
-
-       // First handle a possible ID attribute (null-safe on the type).
-       if ("ID".equalsIgnoreCase(atts.getType(i))) {
-         setIDAttribute(atts.getValue(i), elem);
-       }
-
-       String attrNS = atts.getURI(i);
-
-       if ("".equals(attrNS)) {
-         attrNS = null; // DOM represents no-namespace as null
-       }
-
-       String attrQName = atts.getQName(i);
-
-       // In SAX, namespace declarations have an empty namespace, while in DOM
-       // they must be in the xmlns namespace. This applies to prefixed
-       // declarations ("xmlns:p") and to the default declaration ("xmlns");
-       // the DOM rejects the latter with NAMESPACE_ERR otherwise.
-       if (attrQName.startsWith("xmlns:") || attrQName.equals("xmlns")) {
-         attrNS = "http://www.w3.org/2000/xmlns/";
-       }
-
-       // ALWAYS use the DOM Level 2 call!
-       elem.setAttributeNS(attrNS, attrQName, atts.getValue(i));
-     }
-
-     // The new element becomes the container for subsequent events.
-     m_elemStack.push(elem);
-     m_currentNode = elem;
-   } catch (java.lang.Exception de) {
-     // Any failure while copying attributes is reported as a SAX error,
-     // keeping the original exception as the cause.
-     throw new org.xml.sax.SAXException(de);
-   }
-
- }
-
- /**
- *
- *
- *
- * Receive notification of the end of an element.
- *
- * <p>
- * The SAX parser will invoke this method at the end of every element in the
- * XML document; there will be a corresponding startElement() event for every
- * endElement() event (even when the element is empty).
- * </p>
- *
- * <p>
- * If the element name has a namespace prefix, the prefix will still be
- * attached to the name.
- * </p>
- *
- *
- * @param ns
- * the namespace of the element
- * @param localName
- * The local part of the qualified name of the element
- * @param name
- * The element name
- */
- public void endElement(String ns, String localName, String name)
-     throws org.xml.sax.SAXException {
-   // Close the innermost open element; the enclosing open element (if any)
-   // becomes the current node again.
-   m_elemStack.pop();
-   if (m_elemStack.isEmpty()) {
-     m_currentNode = null;
-   } else {
-     m_currentNode = m_elemStack.peek();
-   }
- }
-
- /**
- * Set an ID string to node association in the ID table.
- *
- * @param id
- * The ID string.
- * @param elem
- * The associated ID.
- */
- public void setIDAttribute(String id, Element elem) {
-
- // Do nothing. This method is meant to be overridden; startElement calls it
- // for every attribute whose SAX type is "ID".
- }
-
- /**
- * Receive notification of character data.
- *
- * <p>
- * The Parser will call this method to report each chunk of character data.
- * SAX parsers may return all contiguous character data in a single chunk, or
- * they may split it into several chunks; however, all of the characters in
- * any single event must come from the same external entity, so that the
- * Locator provides useful information.
- * </p>
- *
- * <p>
- * The application must not attempt to read from the array outside of the
- * specified range.
- * </p>
- *
- * <p>
- * Note that some parsers will report whitespace using the
- * ignorableWhitespace() method rather than this one (validating parsers must
- * do so).
- * </p>
- *
- * @param ch
- * The characters from the XML document.
- * @param start
- * The start position in the array.
- * @param length
- * The number of characters to read from the array.
- * @see #ignorableWhitespace
- * @see org.xml.sax.Locator
- */
- public void characters(char ch[], int start, int length)
-     throws org.xml.sax.SAXException {
-   boolean blankOutsideRoot = isOutsideDocElem()
-       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length);
-   if (blankOutsideRoot) {
-     return; // avoid DOM006 Hierarchy request error
-   }
-
-   // Inside a CDATA section the text is routed to the CDATA handler.
-   if (m_inCData) {
-     cdata(ch, start, length);
-     return;
-   }
-
-   String chunk = new String(ch, start, length);
-   Node previous = (m_currentNode == null) ? null : m_currentNode.getLastChild();
-
-   // Merge with a directly preceding text node instead of creating a sibling.
-   if (previous == null || previous.getNodeType() != Node.TEXT_NODE) {
-     append(m_doc.createTextNode(chunk));
-     return;
-   }
-   ((Text) previous).appendData(chunk);
- }
-
- /**
- * If available, when the disable-output-escaping attribute is used, output
- * raw text without escaping. A PI will be inserted in front of the node with
- * the name "xslt-next-is-raw" and a value of "formatter-to-dom".
- *
- * @param ch
- * Array containing the characters
- * @param start
- * Index to start of characters in the array
- * @param length
- * Number of characters in the array
- */
- public void charactersRaw(char ch[], int start, int length)
-     throws org.xml.sax.SAXException {
-   boolean blankOutsideRoot = isOutsideDocElem()
-       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length);
-   if (blankOutsideRoot) {
-     return; // avoid DOM006 Hierarchy request error
-   }
-
-   String raw = new String(ch, start, length);
-
-   // A marker PI precedes the text node that holds the raw characters.
-   Node marker = m_doc.createProcessingInstruction("xslt-next-is-raw",
-       "formatter-to-dom");
-   append(marker);
-
-   Node rawText = m_doc.createTextNode(raw);
-   append(rawText);
- }
-
- /**
- * Report the beginning of an entity.
- *
- * The start and end of the document entity are not reported. The start and
- * end of the external DTD subset are reported using the pseudo-name "[dtd]".
- * All other events must be properly nested within start/end entity events.
- *
- * @param name
- * The name of the entity. If it is a parameter entity, the name will
- * begin with '%'.
- * @see #endEntity
- * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
- * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
- */
- public void startEntity(String name) throws org.xml.sax.SAXException {
-
- // Intentionally empty. Emitting an entity reference node here was disabled
- // as "almost certainly the wrong behavior":
- // entityReference(name);
- }
-
- /**
- * Report the end of an entity.
- *
- * @param name
- * The name of the entity that is ending.
- * @see #startEntity
- */
- public void endEntity(String name) throws org.xml.sax.SAXException {
- // Intentionally empty: entity boundaries leave no trace in the DOM.
- }
-
- /**
- * Receive notification of an entity reference.
- *
- * @param name
- * name of the entity reference
- */
- public void entityReference(String name) throws org.xml.sax.SAXException {
-   // Create the reference node from the owner document, then attach it.
-   Node reference = m_doc.createEntityReference(name);
-   append(reference);
- }
-
- /**
- * Receive notification of ignorable whitespace in element content.
- *
- * <p>
- * Validating Parsers must use this method to report each chunk of ignorable
- * whitespace (see the W3C XML 1.0 recommendation, section 2.10):
- * non-validating parsers may also use this method if they are capable of
- * parsing and using content models.
- * </p>
- *
- * <p>
- * SAX parsers may return all contiguous whitespace in a single chunk, or they
- * may split it into several chunks; however, all of the characters in any
- * single event must come from the same external entity, so that the Locator
- * provides useful information.
- * </p>
- *
- * <p>
- * The application must not attempt to read from the array outside of the
- * specified range.
- * </p>
- *
- * @param ch
- * The characters from the XML document.
- * @param start
- * The start position in the array.
- * @param length
- * The number of characters to read from the array.
- * @see #characters
- */
- public void ignorableWhitespace(char ch[], int start, int length)
-     throws org.xml.sax.SAXException {
-   // Only keep the whitespace when there is an element or fragment to hold it.
-   if (!isOutsideDocElem()) {
-     String blanks = new String(ch, start, length);
-     append(m_doc.createTextNode(blanks));
-   }
-   // otherwise: dropped, to avoid DOM006 Hierarchy request error
- }
-
- /**
- * Tell if the current node is outside the document element.
- *
- * @return true if the current node is outside the document element.
- */
- private boolean isOutsideDocElem() {
-   // Building into a fragment, or with an element open, is never "outside".
-   if (m_docFrag != null || m_elemStack.size() != 0) {
-     return false;
-   }
-   // Outside means appending directly to the document (or to nothing at all).
-   return m_currentNode == null
-       || m_currentNode.getNodeType() == Node.DOCUMENT_NODE;
- }
-
- /**
- * Receive notification of a processing instruction.
- *
- * <p>
- * The Parser will invoke this method once for each processing instruction
- * found: note that processing instructions may occur before or after the main
- * document element.
- * </p>
- *
- * <p>
- * A SAX parser should never report an XML declaration (XML 1.0, section 2.8)
- * or a text declaration (XML 1.0, section 4.3.1) using this method.
- * </p>
- *
- * @param target
- * The processing instruction target.
- * @param data
- * The processing instruction data, or null if none was supplied.
- */
- public void processingInstruction(String target, String data)
-     throws org.xml.sax.SAXException {
-   // Create the PI node from the owner document, then attach it.
-   Node instruction = m_doc.createProcessingInstruction(target, data);
-   append(instruction);
- }
-
- /**
- * Report an XML comment anywhere in the document.
- *
- * This callback will be used for comments inside or outside the document
- * element, including comments in the external DTD subset (if read).
- *
- * @param ch
- * An array holding the characters in the comment.
- * @param start
- * The starting position in the array.
- * @param length
- * The number of characters to use from the array.
- */
- public void comment(char ch[], int start, int length)
-     throws org.xml.sax.SAXException {
-   // tagsoup sometimes submits invalid values here; silently ignore any range
-   // that does not lie within the array. A range that ends exactly at the end
-   // of the array (start + length == ch.length) is valid and must be kept.
-   if (ch == null || start < 0 || length < 0 || length > (ch.length - start)) {
-     return;
-   }
-   append(m_doc.createComment(new String(ch, start, length)));
- }
-
- /**
-  * Flag indicating that we are processing a CData section: set by startCDATA,
-  * cleared by endCDATA, and checked by characters() to route text to cdata().
-  */
- protected boolean m_inCData = false;
-
- /**
- * Report the start of a CDATA section.
- *
- * @see #endCDATA
- */
- public void startCDATA() throws org.xml.sax.SAXException {
-   m_inCData = true;
-   // An empty section is attached now; its content is added by cdata().
-   CDATASection section = m_doc.createCDATASection("");
-   append(section);
- }
-
- /**
- * Report the end of a CDATA section.
- *
- * @see #startCDATA
- */
- public void endCDATA() throws org.xml.sax.SAXException {
-   // Subsequent character data is handled as ordinary text again.
-   this.m_inCData = false;
- }
-
- /**
- * Receive notification of cdata.
- *
- * <p>
- * The Parser will call this method to report each chunk of character data.
- * SAX parsers may return all contiguous character data in a single chunk, or
- * they may split it into several chunks; however, all of the characters in
- * any single event must come from the same external entity, so that the
- * Locator provides useful information.
- * </p>
- *
- * <p>
- * The application must not attempt to read from the array outside of the
- * specified range.
- * </p>
- *
- * <p>
- * Note that some parsers will report whitespace using the
- * ignorableWhitespace() method rather than this one (validating parsers must
- * do so).
- * </p>
- *
- * @param ch
- * The characters from the XML document.
- * @param start
- * The start position in the array.
- * @param length
- * The number of characters to read from the array.
- * @see #ignorableWhitespace
- * @see org.xml.sax.Locator
- */
- public void cdata(char ch[], int start, int length)
-     throws org.xml.sax.SAXException {
-   if (isOutsideDocElem()
-       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) {
-     return; // avoid DOM006 Hierarchy request error
-   }
-
-   String s = new String(ch, start, length);
-
-   // Look for the section in the same place append() would have put it: the
-   // current node, else the fragment, else the document. Dereferencing
-   // m_currentNode directly fails with a NullPointerException when no element
-   // is open.
-   Node container = m_currentNode;
-   if (container == null) {
-     container = m_docFrag;
-   }
-   if (container == null) {
-     container = m_doc;
-   }
-   if (container == null) {
-     return;
-   }
-
-   // XXX ab@apache.org: modified from the original, to accommodate TagSoup.
-   // The data is appended to a trailing CDATA section or comment; if the last
-   // child is anything else the data is dropped.
-   Node n = container.getLastChild();
-   if (n instanceof CDATASection) {
-     ((CDATASection) n).appendData(s);
-   } else if (n instanceof Comment) {
-     ((Comment) n).appendData(s);
-   }
- }
-
- /**
- * Report the start of DTD declarations, if any.
- *
- * Any declarations are assumed to be in the internal subset unless otherwise
- * indicated.
- *
- * @param name
- * The document type name.
- * @param publicId
- * The declared public identifier for the external DTD subset, or
- * null if none was declared.
- * @param systemId
- * The declared system identifier for the external DTD subset, or
- * null if none was declared.
- * @see #endDTD
- * @see #startEntity
- */
- public void startDTD(String name, String publicId, String systemId)
- throws org.xml.sax.SAXException {
-
- // Do nothing for now: no document type node is created.
- }
-
- /**
- * Report the end of DTD declarations.
- *
- * @see #startDTD
- */
- public void endDTD() throws org.xml.sax.SAXException {
-
- // Do nothing for now: startDTD creates nothing that needs closing.
- }
-
- /**
- * Begin the scope of a prefix-URI Namespace mapping.
- *
- * <p>
- * The information from this event is not necessary for normal Namespace
- * processing: the SAX XML reader will automatically replace prefixes for
- * element and attribute names when the http://xml.org/sax/features/namespaces
- * feature is true (the default).
- * </p>
- *
- * <p>
- * There are cases, however, when applications need to use prefixes in
- * character data or in attribute values, where they cannot safely be expanded
- * automatically; the start/endPrefixMapping event supplies the information to
- * the application to expand prefixes in those contexts itself, if necessary.
- * </p>
- *
- * <p>
- * Note that start/endPrefixMapping events are not guaranteed to be properly
- * nested relative to each-other: all startPrefixMapping events will occur
- * before the corresponding startElement event, and all endPrefixMapping
- * events will occur after the corresponding endElement event, but their order
- * is not guaranteed.
- * </p>
- *
- * @param prefix
- * The Namespace prefix being declared.
- * @param uri
- * The Namespace URI the prefix is mapped to.
- * @see #endPrefixMapping
- * @see #startElement
- */
- public void startPrefixMapping(String prefix, String uri)
- throws org.xml.sax.SAXException {
-
- /*
-  * Intentionally empty. A disabled earlier version (marked "not sure if this
-  * is needed or wanted", and noted to fail "in the stree") did the following
-  * when the current node was an element: build the attribute name "xmlns"
-  * (null or empty prefix) or "xmlns:" + prefix, and, if the element had no
-  * such attribute yet, set it to uri via setAttributeNS.
-  */
- }
-
- /**
- * End the scope of a prefix-URI mapping.
- *
- * <p>
- * See startPrefixMapping for details. This event will always occur after the
- * corresponding endElement event, but the order of endPrefixMapping events is
- * not otherwise guaranteed.
- * </p>
- *
- * @param prefix
- * The prefix that was being mapped.
- * @see #startPrefixMapping
- * @see #endElement
- */
- public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
- // Intentionally empty: startPrefixMapping records nothing to undo.
- }
-
- /**
- * Receive notification of a skipped entity.
- *
- * <p>
- * The Parser will invoke this method once for each entity skipped.
- * Non-validating processors may skip entities if they have not seen the
- * declarations (because, for example, the entity was declared in an external
- * DTD subset). All processors may skip external entities, depending on the
- * values of the http://xml.org/sax/features/external-general-entities and the
- * http://xml.org/sax/features/external-parameter-entities properties.
- * </p>
- *
- * @param name
- * The name of the skipped entity. If it is a parameter entity, the
- * name will begin with '%'.
- */
- public void skippedEntity(String name) throws org.xml.sax.SAXException {
- // Intentionally empty: skipped entities are not represented in the DOM.
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
deleted file mode 100644
index 3c2aba0..0000000
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++ /dev/null
@@ -1,400 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import java.net.URL;
-import java.net.MalformedURLException;
-import java.util.Collection;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Stack;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.util.NodeWalker;
-import org.apache.nutch.util.URLUtil;
-import org.apache.hadoop.conf.Configuration;
-
-import org.w3c.dom.*;
-
-/**
- * A collection of methods for extracting content from DOM trees.
- *
- * This class holds a few utility methods for pulling content out of DOM nodes,
- * such as getOutlinks, getText, etc.
- *
- */
-public class DOMContentUtils {
-
- /** Describes one kind of link-bearing element handled by getOutlinks. */
- public static class LinkParams {
-   /** Element name, e.g. "a". */
-   public String elName;
-   /** Name of the attribute that carries the link target, e.g. "href". */
-   public String attrName;
-   /**
-    * 0 for elements registered as having no children; a childless element is
-    * only kept as a link when this is 0.
-    */
-   public int childLen;
-
-   public LinkParams(String elName, String attrName, int childLen) {
-     this.elName = elName;
-     this.attrName = attrName;
-     this.childLen = childLen;
-   }
-
-   public String toString() {
-     StringBuilder text = new StringBuilder("LP[el=");
-     text.append(elName);
-     text.append(",attr=").append(attrName);
-     text.append(",len=").append(childLen);
-     text.append("]");
-     return text.toString();
-   }
- }
-
- /** Link-bearing elements, keyed by lower-case element name; rebuilt by setConf. */
- private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>();
- /** Configuration last passed to setConf. */
- private Configuration conf;
-
- public DOMContentUtils(Configuration conf) {
-   // All initialisation (including the link table) lives in setConf.
-   this.setConf(conf);
- }
-
- /**
-  * Stores the configuration and rebuilds the table of link-bearing elements.
-  *
-  * <p>
-  * <code>parser.html.form.use_action</code> (default true) controls whether
-  * form actions count as links. Elements listed in
-  * <code>parser.html.outlinks.ignore_tags</code> are removed from the table,
-  * except that "form" is kept when <code>use_action</code> was set explicitly.
-  * Ignore-list entries are trimmed and lower-cased before matching, because
-  * the table keys are lower-case element names.
-  * </p>
-  *
-  * @param conf
-  *          the configuration to read the settings from
-  */
- public void setConf(Configuration conf) {
-   // forceTags is used to override configurable tag ignoring, later on
-   Collection<String> forceTags = new ArrayList<String>(1);
-
-   this.conf = conf;
-   linkParams.clear();
-   linkParams.put("a", new LinkParams("a", "href", 1));
-   linkParams.put("area", new LinkParams("area", "href", 0));
-   if (conf.getBoolean("parser.html.form.use_action", true)) {
-     linkParams.put("form", new LinkParams("form", "action", 1));
-     // an explicitly configured use_action wins over the ignore list
-     if (conf.get("parser.html.form.use_action") != null) {
-       forceTags.add("form");
-     }
-   }
-   linkParams.put("frame", new LinkParams("frame", "src", 0));
-   linkParams.put("iframe", new LinkParams("iframe", "src", 0));
-   linkParams.put("script", new LinkParams("script", "src", 0));
-   linkParams.put("link", new LinkParams("link", "href", 0));
-   linkParams.put("img", new LinkParams("img", "src", 0));
-
-   // remove unwanted link tags from the linkParams map
-   String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
-   if (ignoreTags != null) {
-     for (String ignoreTag : ignoreTags) {
-       if (ignoreTag == null) {
-         continue;
-       }
-       // "img, script" yields " script", and users may write "IMG"; without
-       // normalisation such entries never match a key and are ignored.
-       String tag = ignoreTag.trim().toLowerCase(java.util.Locale.ROOT);
-       if (!forceTags.contains(tag)) {
-         linkParams.remove(tag);
-       }
-     }
-   }
- }
-
- /**
- * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
- * append all the content text found beneath the DOM node to the
- * <code>StringBuffer</code>.
- *
- * <p>
- *
- * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
- * and the <code>StringBuffer</code> will not contain any text encountered
- * after a nested anchor is found.
- *
- * <p>
- *
- * @return true if nested anchors were found
- */
- public boolean getText(StringBuffer sb, Node node,
-     boolean abortOnNestedAnchors) {
-   // Start the traversal with an anchor depth of zero.
-   return getTextHelper(sb, node, abortOnNestedAnchors, 0);
- }
-
- /**
- * This is a convenience method, equivalent to
- * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
- *
- */
- public void getText(StringBuffer sb, Node node) {
-   // Never abort on nested anchors; the result flag is not needed.
-   this.getText(sb, node, false);
- }
-
- // returns true if abortOnNestedAnchors is true and we find nested
- // anchors
- private boolean getTextHelper(StringBuffer sb, Node node,
-     boolean abortOnNestedAnchors, int anchorDepth) {
-   for (NodeWalker walker = new NodeWalker(node); walker.hasNext();) {
-     Node current = walker.nextNode();
-     String name = current.getNodeName();
-     short type = current.getNodeType();
-
-     // The contents of script and style elements are not page text.
-     if ("script".equalsIgnoreCase(name) || "style".equalsIgnoreCase(name)) {
-       walker.skipChildren();
-     }
-
-     // Every anchor seen raises the depth; the second one aborts the walk.
-     if (abortOnNestedAnchors && "a".equalsIgnoreCase(name)
-         && ++anchorDepth > 1) {
-       return true;
-     }
-
-     if (type == Node.COMMENT_NODE) {
-       walker.skipChildren();
-     }
-     if (type != Node.TEXT_NODE) {
-       continue;
-     }
-
-     // Collapse whitespace runs, trim, and join pieces with single spaces.
-     String text = current.getNodeValue().replaceAll("\\s+", " ").trim();
-     if (text.length() == 0) {
-       continue;
-     }
-     if (sb.length() > 0) {
-       sb.append(' ');
-     }
-     sb.append(text);
-   }
-
-   return false;
- }
-
- /**
- * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
- * append the content text found beneath the first <code>title</code> node to
- * the <code>StringBuffer</code>.
- *
- * @return true if a title node was found, false otherwise
- */
- public boolean getTitle(StringBuffer sb, Node node) {
-
-   for (NodeWalker walker = new NodeWalker(node); walker.hasNext();) {
-     Node current = walker.nextNode();
-     String name = current.getNodeName();
-
-     // stop after HEAD: a title is not searched for inside the body
-     if ("body".equalsIgnoreCase(name)) {
-       return false;
-     }
-
-     boolean isTitleElement = current.getNodeType() == Node.ELEMENT_NODE
-         && "title".equalsIgnoreCase(name);
-     if (isTitleElement) {
-       getText(sb, current);
-       return true;
-     }
-   }
-
-   return false;
- }
-
- /**
-  * If Node contains a BASE tag (before any BODY element) with an HREF that
-  * parses as a URL, that URL is returned; otherwise null.
-  */
- public URL getBase(Node node) {
-
-   for (NodeWalker walker = new NodeWalker(node); walker.hasNext();) {
-     Node current = walker.nextNode();
-     if (current.getNodeType() != Node.ELEMENT_NODE) {
-       continue;
-     }
-
-     String name = current.getNodeName();
-     if ("body".equalsIgnoreCase(name)) { // stop after HEAD
-       return null;
-     }
-     if (!"base".equalsIgnoreCase(name)) {
-       continue;
-     }
-
-     // this is a BASE tag: look for its href attribute
-     NamedNodeMap attrs = current.getAttributes();
-     for (int i = 0; i < attrs.getLength(); i++) {
-       Node attr = attrs.item(i);
-       if (!"href".equalsIgnoreCase(attr.getNodeName())) {
-         continue;
-       }
-       try {
-         return new URL(attr.getNodeValue());
-       } catch (MalformedURLException e) {
-         // unusable href: keep scanning
-       }
-     }
-   }
-
-   // no usable BASE tag found
-   return null;
- }
-
- private boolean hasOnlyWhiteSpace(Node node) {
-   // True when the node value has no non-whitespace character (also when empty).
-   String value = node.getNodeValue();
-   int pos = 0;
-   while (pos < value.length()) {
-     if (!Character.isWhitespace(value.charAt(pos))) {
-       return false;
-     }
-     pos++;
-   }
-   return true;
- }
-
- // this only covers a few cases of empty links that are symptomatic
- // of nekohtml's DOM-fixup process...
- private boolean shouldThrowAwayLink(Node node, NodeList children,
-     int childLen, LinkParams params) {
-   switch (childLen) {
-   case 0:
-     // no inner structure: acceptable only for elements registered as
-     // childless
-     return params.childLen != 0;
-
-   case 1: {
-     // single nested link
-     Node only = children.item(0);
-     return only.getNodeType() == Node.ELEMENT_NODE
-         && params.elName.equalsIgnoreCase(only.getNodeName());
-   }
-
-   case 2: {
-     Node first = children.item(0);
-     Node second = children.item(1);
-
-     // single link followed by whitespace node
-     if (first.getNodeType() == Node.ELEMENT_NODE
-         && params.elName.equalsIgnoreCase(first.getNodeName())
-         && second.getNodeType() == Node.TEXT_NODE
-         && hasOnlyWhiteSpace(second)) {
-       return true;
-     }
-
-     // whitespace node followed by single link
-     return second.getNodeType() == Node.ELEMENT_NODE
-         && params.elName.equalsIgnoreCase(second.getNodeName())
-         && first.getNodeType() == Node.TEXT_NODE
-         && hasOnlyWhiteSpace(first);
-   }
-
-   case 3: {
-     Node before = children.item(0);
-     Node middle = children.item(1);
-     Node after = children.item(2);
-
-     // single link surrounded by whitespace nodes
-     return middle.getNodeType() == Node.ELEMENT_NODE
-         && params.elName.equalsIgnoreCase(middle.getNodeName())
-         && before.getNodeType() == Node.TEXT_NODE
-         && after.getNodeType() == Node.TEXT_NODE
-         && hasOnlyWhiteSpace(before)
-         && hasOnlyWhiteSpace(after);
-   }
-
-   default:
-     return false;
-   }
- }
-
- /**
- * This method finds all anchors below the supplied DOM <code>node</code>, and
- * creates appropriate {@link Outlink} records for each (relative to the
- * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
- * {@link ArrayList}.
- *
- * <p>
- *
- * Links without inner structure (tags, text, etc) are discarded, as are links
- * which contain only single nested links and empty text nodes (this is a
- * common DOM-fixup artifact, at least with nekohtml).
- */
- public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
-
-   NodeWalker walker = new NodeWalker(node);
-   while (walker.hasNext()) {
-
-     Node currentNode = walker.nextNode();
-     if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
-       continue;
-     }
-
-     // linkParams is keyed by lower-case element name; Locale.ROOT keeps the
-     // lookup independent of the default locale (e.g. Turkish dotless i).
-     String nodeName = currentNode.getNodeName().toLowerCase(
-         java.util.Locale.ROOT);
-     LinkParams params = linkParams.get(nodeName);
-     if (params == null) {
-       continue; // not a link-bearing element, or configured to be ignored
-     }
-
-     NodeList children = currentNode.getChildNodes();
-     int childLen = (children != null) ? children.getLength() : 0;
-     if (shouldThrowAwayLink(currentNode, children, childLen, params)) {
-       continue;
-     }
-
-     StringBuffer linkText = new StringBuffer();
-     getText(linkText, currentNode, true);
-     if (linkText.toString().trim().length() == 0) {
-       // try harder - use img alt if present
-       NodeWalker subWalker = new NodeWalker(currentNode);
-       while (subWalker.hasNext()) {
-         Node subNode = subWalker.nextNode();
-         String extra = null;
-         if (subNode.getNodeType() == Node.ELEMENT_NODE) {
-           // only img elements contribute; other elements are ignored
-           if (subNode.getNodeName().toLowerCase(java.util.Locale.ROOT)
-               .equals("img")) {
-             Node alt = subNode.getAttributes().getNamedItem("alt");
-             if (alt != null) {
-               String altTxt = alt.getTextContent();
-               if (altTxt != null && altTxt.trim().length() > 0) {
-                 extra = altTxt;
-               }
-             }
-           }
-         } else if (subNode.getNodeType() == Node.TEXT_NODE) {
-           String txt = subNode.getTextContent();
-           if (txt != null && txt.length() > 0) {
-             extra = txt;
-           }
-         }
-         if (extra != null) {
-           if (linkText.length() > 0) {
-             linkText.append(' ');
-           }
-           linkText.append(extra);
-         }
-       }
-     }
-
-     NamedNodeMap attrs = currentNode.getAttributes();
-     String target = null;
-     boolean noFollow = false;
-     boolean post = false;
-     for (int i = 0; i < attrs.getLength(); i++) {
-       Node attr = attrs.item(i);
-       String attrName = attr.getNodeName();
-       if (params.attrName.equalsIgnoreCase(attrName)) {
-         target = attr.getNodeValue();
-       } else if ("rel".equalsIgnoreCase(attrName)) {
-         // rel holds a space-separated list of link types, so "nofollow" must
-         // be looked for among the tokens (e.g. rel="nofollow noopener").
-         String rel = attr.getNodeValue();
-         if (rel != null) {
-           for (String token : rel.trim().split("\\s+")) {
-             if ("nofollow".equalsIgnoreCase(token)) {
-               noFollow = true;
-             }
-           }
-         }
-       } else if ("method".equalsIgnoreCase(attrName)
-           && "post".equalsIgnoreCase(attr.getNodeValue())) {
-         post = true;
-       }
-     }
-
-     if (target != null && !noFollow && !post) {
-       try {
-         URL url = URLUtil.resolveURL(base, target);
-         outlinks.add(new Outlink(url.toString(), linkText.toString().trim()));
-       } catch (MalformedURLException e) {
-         // don't care: an unresolvable target simply yields no outlink
-       }
-     }
-     // NOTE: the walk continues into this element's children even when
-     // params.childLen == 0. The original ended the loop body with a bare
-     // "continue" here, which had no effect.
-   }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
deleted file mode 100644
index 159aa76..0000000
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
+++ /dev/null
@@ -1,214 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import java.net.URL;
-
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.w3c.dom.*;
-
-/**
- * Class for parsing META Directives from DOM trees. This class handles
- * specifically Robots META directives (all, none, nofollow, noindex), finding
- * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are
- * stored in a HTMLMetaTags instance.
- */
-public class HTMLMetaProcessor {
-
- /**
- * Utility class with indicators for the robots directives "noindex" and
- * "nofollow", and HTTP-EQUIV/no-cache
- */
-
- /**
- * Sets the indicators in <code>metaTags</code> to appropriate values, based
- * on any META tags found under the given <code>node</code>.
- */
- public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
- URL currURL) {
-
- metaTags.reset();
- getMetaTagsHelper(metaTags, node, currURL);
- }
-
- private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
- URL currURL) {
-
- if (node.getNodeType() == Node.ELEMENT_NODE) {
-
- if ("body".equalsIgnoreCase(node.getNodeName())) {
- // META tags should not be under body
- return;
- }
-
- if ("meta".equalsIgnoreCase(node.getNodeName())) {
- NamedNodeMap attrs = node.getAttributes();
- Node nameNode = null;
- Node equivNode = null;
- Node contentNode = null;
- // Retrieves name, http-equiv and content attributes
- for (int i = 0; i < attrs.getLength(); i++) {
- Node attr = attrs.item(i);
- String attrName = attr.getNodeName().toLowerCase();
- if (attrName.equals("name")) {
- nameNode = attr;
- } else if (attrName.equals("http-equiv")) {
- equivNode = attr;
- } else if (attrName.equals("content")) {
- contentNode = attr;
- }
- }
-
- if (nameNode != null) {
- if (contentNode != null) {
- String name = nameNode.getNodeValue().toLowerCase();
- metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
- if ("robots".equals(name)) {
-
- if (contentNode != null) {
- String directives = contentNode.getNodeValue().toLowerCase();
- int index = directives.indexOf("none");
-
- if (index >= 0) {
- metaTags.setNoIndex();
- metaTags.setNoFollow();
- }
-
- index = directives.indexOf("all");
- if (index >= 0) {
- // do nothing...
- }
-
- index = directives.indexOf("noindex");
- if (index >= 0) {
- metaTags.setNoIndex();
- }
-
- index = directives.indexOf("nofollow");
- if (index >= 0) {
- metaTags.setNoFollow();
- }
-
- index = directives.indexOf("noarchive");
- if (index >= 0) {
- metaTags.setNoCache();
- }
- }
-
- } // end if (name == robots)
- }
- }
-
- if (equivNode != null) {
- if (contentNode != null) {
- String name = equivNode.getNodeValue().toLowerCase();
- String content = contentNode.getNodeValue();
- metaTags.getHttpEquivTags().setProperty(name, content);
- if ("pragma".equals(name)) {
- content = content.toLowerCase();
- int index = content.indexOf("no-cache");
- if (index >= 0)
- metaTags.setNoCache();
- } else if ("refresh".equals(name)) {
- int idx = content.indexOf(';');
- String time = null;
- if (idx == -1) { // just the refresh time
- time = content;
- } else
- time = content.substring(0, idx);
- try {
- metaTags.setRefreshTime(Integer.parseInt(time));
- // skip this if we couldn't parse the time
- metaTags.setRefresh(true);
- } catch (Exception e) {
- ;
- }
- URL refreshUrl = null;
- if (metaTags.getRefresh() && idx != -1) { // set the URL
- idx = content.toLowerCase().indexOf("url=");
- if (idx == -1) { // assume a mis-formatted entry with just the
- // url
- idx = content.indexOf(';') + 1;
- } else
- idx += 4;
- if (idx != -1) {
- String url = content.substring(idx);
- try {
- refreshUrl = new URL(url);
- } catch (Exception e) {
- // XXX according to the spec, this has to be an absolute
- // XXX url. However, many websites use relative URLs and
- // XXX expect browsers to handle that.
- // XXX Unfortunately, in some cases this may create
- // XXX infinitely recursive paths (a crawler trap)...
- // if (!url.startsWith("/")) url = "/" + url;
- try {
- refreshUrl = new URL(currURL, url);
- } catch (Exception e1) {
- refreshUrl = null;
- }
- }
- }
- }
- if (metaTags.getRefresh()) {
- if (refreshUrl == null) {
- // apparently only refresh time was present. set the URL
- // to the same URL.
- refreshUrl = currURL;
- }
- metaTags.setRefreshHref(refreshUrl);
- }
- }
- }
- }
-
- } else if ("base".equalsIgnoreCase(node.getNodeName())) {
- NamedNodeMap attrs = node.getAttributes();
- Node hrefNode = attrs.getNamedItem("href");
-
- if (hrefNode != null) {
- String urlString = hrefNode.getNodeValue();
-
- URL url = null;
- try {
- if (currURL == null)
- url = new URL(urlString);
- else
- url = new URL(currURL, urlString);
- } catch (Exception e) {
- ;
- }
-
- if (url != null)
- metaTags.setBaseHref(url);
- }
-
- }
-
- }
-
- NodeList children = node.getChildNodes();
- if (children != null) {
- int len = children.getLength();
- for (int i = 0; i < len; i++) {
- getMetaTagsHelper(metaTags, children.item(i), currURL);
- }
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
deleted file mode 100644
index 4d043ba..0000000
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ /dev/null
@@ -1,352 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import java.util.ArrayList;
-import java.util.Map;
-import java.net.URL;
-import java.net.MalformedURLException;
-import java.nio.charset.StandardCharsets;
-import java.io.*;
-import java.util.regex.*;
-
-import org.cyberneko.html.parsers.*;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.protocol.Content;
-import org.apache.hadoop.conf.*;
-import org.apache.nutch.parse.*;
-import org.apache.nutch.util.*;
-
-public class HtmlParser implements Parser {
- public static final Logger LOG = LoggerFactory
- .getLogger("org.apache.nutch.parse.html");
-
- // I used 1000 bytes at first, but found that some documents have
- // meta tag well past the first 1000 bytes.
- // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
- // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
- private static final int CHUNK_SIZE = 8192;
-
- // NUTCH-1006 Meta equiv with single quotes not accepted
- private static Pattern metaPattern = Pattern.compile(
- "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
- Pattern.CASE_INSENSITIVE);
- private static Pattern charsetPattern = Pattern.compile(
- "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
- private static Pattern charsetPatternHTML5 = Pattern.compile(
- "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
- Pattern.CASE_INSENSITIVE);
-
- private String parserImpl;
-
- /**
- * Given a <code>byte[]</code> representing an html file of an
- * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
- * from the first <code>CHUNK_SIZE</code> bytes. If there's no meta tag for
- * Content-Type or no charset is specified, the content is checked for a
- * Unicode Byte Order Mark (BOM). This will also cover non-byte oriented
- * character encodings (UTF-16 only). If no character set can be determined,
- * <code>null</code> is returned. <br />
- * See also
- * http://www.w3.org/International/questions/qa-html-encoding-declarations,
- * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and
- * http://www.w3.org/TR/REC-xml/#sec-guessing
- *
- * @param content
- * <code>byte[]</code> representation of an html file
- */
-
- private static String sniffCharacterEncoding(byte[] content) {
- int length = content.length < CHUNK_SIZE ? content.length : CHUNK_SIZE;
-
- // We don't care about non-ASCII parts: the meta tag and the charset name
- // we look for are plain ASCII. Decoding as US-ASCII keeps ASCII bytes
- // unchanged and maps every other byte to the replacement character,
- // e.g. the sequence {0x41, 0x82, 0xb7} becomes {U+0041, U+FFFD, U+FFFD}.
- String str = new String(content, 0, length, StandardCharsets.US_ASCII);
-
- Matcher metaMatcher = metaPattern.matcher(str);
- String encoding = null;
- if (metaMatcher.find()) {
- Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
- if (charsetMatcher.find())
- encoding = new String(charsetMatcher.group(1));
- }
- if (encoding == null) {
- // check for HTML5 meta charset
- metaMatcher = charsetPatternHTML5.matcher(str);
- if (metaMatcher.find()) {
- encoding = new String(metaMatcher.group(1));
- }
- }
- if (encoding == null) {
- // check for BOM
- if (content.length >= 3 && content[0] == (byte) 0xEF
- && content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
- encoding = "UTF-8";
- } else if (content.length >= 2) {
- if (content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) {
- encoding = "UTF-16LE";
- } else if (content[0] == (byte) 0xFE && content[1] == (byte) 0xFF) {
- encoding = "UTF-16BE";
- }
- }
- }
-
- return encoding;
- }
-
- private String defaultCharEncoding;
-
- private Configuration conf;
-
- private DOMContentUtils utils;
-
- private HtmlParseFilters htmlParseFilters;
-
- private String cachingPolicy;
-
- public ParseResult getParse(Content content) {
- HTMLMetaTags metaTags = new HTMLMetaTags();
-
- URL base;
- try {
- base = new URL(content.getBaseUrl());
- } catch (MalformedURLException e) {
- return new ParseStatus(e)
- .getEmptyParseResult(content.getUrl(), getConf());
- }
-
- String text = "";
- String title = "";
- Outlink[] outlinks = new Outlink[0];
- Metadata metadata = new Metadata();
-
- // parse the content
- DocumentFragment root;
- try {
- byte[] contentInOctets = content.getContent();
- InputSource input = new InputSource(new ByteArrayInputStream(
- contentInOctets));
-
- EncodingDetector detector = new EncodingDetector(conf);
- detector.autoDetectClues(content, true);
- detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
- String encoding = detector.guessEncoding(content, defaultCharEncoding);
-
- metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
- metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
-
- input.setEncoding(encoding);
- if (LOG.isTraceEnabled()) {
- LOG.trace("Parsing...");
- }
- root = parse(input);
- } catch (IOException e) {
- return new ParseStatus(e)
- .getEmptyParseResult(content.getUrl(), getConf());
- } catch (DOMException e) {
- return new ParseStatus(e)
- .getEmptyParseResult(content.getUrl(), getConf());
- } catch (SAXException e) {
- return new ParseStatus(e)
- .getEmptyParseResult(content.getUrl(), getConf());
- } catch (Exception e) {
- LOG.error("Error: ", e);
- return new ParseStatus(e)
- .getEmptyParseResult(content.getUrl(), getConf());
- }
-
- // get meta directives
- HTMLMetaProcessor.getMetaTags(metaTags, root, base);
-
- // populate Nutch metadata with HTML meta directives
- metadata.addAll(metaTags.getGeneralTags());
-
- if (LOG.isTraceEnabled()) {
- LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
- }
- // check meta directives
- if (!metaTags.getNoIndex()) { // okay to index
- StringBuffer sb = new StringBuffer();
- if (LOG.isTraceEnabled()) {
- LOG.trace("Getting text...");
- }
- utils.getText(sb, root); // extract text
- text = sb.toString();
- sb.setLength(0);
- if (LOG.isTraceEnabled()) {
- LOG.trace("Getting title...");
- }
- utils.getTitle(sb, root); // extract title
- title = sb.toString().trim();
- }
-
- if (!metaTags.getNoFollow()) { // okay to follow links
- ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
- URL baseTag = utils.getBase(root);
- if (LOG.isTraceEnabled()) {
- LOG.trace("Getting links...");
- }
- utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
- outlinks = l.toArray(new Outlink[l.size()]);
- if (LOG.isTraceEnabled()) {
- LOG.trace("found " + outlinks.length + " outlinks in "
- + content.getUrl());
- }
- }
-
- ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
- if (metaTags.getRefresh()) {
- status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
- status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
- Integer.toString(metaTags.getRefreshTime()) });
- }
- ParseData parseData = new ParseData(status, title, outlinks,
- content.getMetadata(), metadata);
- ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
- new ParseImpl(text, parseData));
-
- // run filters on parse
- ParseResult filteredParse = this.htmlParseFilters.filter(content,
- parseResult, metaTags, root);
- if (metaTags.getNoCache()) { // not okay to cache
- for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
- entry.getValue().getData().getParseMeta()
- .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
- }
- return filteredParse;
- }
-
- private DocumentFragment parse(InputSource input) throws Exception {
- if (parserImpl.equalsIgnoreCase("tagsoup"))
- return parseTagSoup(input);
- else
- return parseNeko(input);
- }
-
- private DocumentFragment parseTagSoup(InputSource input) throws Exception {
- HTMLDocumentImpl doc = new HTMLDocumentImpl();
- DocumentFragment frag = doc.createDocumentFragment();
- DOMBuilder builder = new DOMBuilder(doc, frag);
- org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser();
- reader.setContentHandler(builder);
- reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
- reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false);
- reader
- .setProperty("http://xml.org/sax/properties/lexical-handler", builder);
- reader.parse(input);
- return frag;
- }
-
- private DocumentFragment parseNeko(InputSource input) throws Exception {
- DOMFragmentParser parser = new DOMFragmentParser();
- try {
- parser
- .setFeature(
- "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
- true);
- parser.setFeature("http://cyberneko.org/html/features/augmentations",
- true);
- parser.setProperty(
- "http://cyberneko.org/html/properties/default-encoding",
- defaultCharEncoding);
- parser
- .setFeature(
- "http://cyberneko.org/html/features/scanner/ignore-specified-charset",
- true);
- parser
- .setFeature(
- "http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
- false);
- parser.setFeature(
- "http://cyberneko.org/html/features/balance-tags/document-fragment",
- true);
- parser.setFeature("http://cyberneko.org/html/features/report-errors",
- LOG.isTraceEnabled());
- } catch (SAXException e) {
- }
- // convert Document to DocumentFragment
- HTMLDocumentImpl doc = new HTMLDocumentImpl();
- doc.setErrorChecking(false);
- DocumentFragment res = doc.createDocumentFragment();
- DocumentFragment frag = doc.createDocumentFragment();
- parser.parse(input, frag);
- res.appendChild(frag);
-
- try {
- while (true) {
- frag = doc.createDocumentFragment();
- parser.parse(input, frag);
- if (!frag.hasChildNodes())
- break;
- if (LOG.isInfoEnabled()) {
- LOG.info(" - new frag, " + frag.getChildNodes().getLength()
- + " nodes.");
- }
- res.appendChild(frag);
- }
- } catch (Exception e) {
- LOG.error("Error: ", e);
- }
- ;
- return res;
- }
-
- public static void main(String[] args) throws Exception {
- // LOG.setLevel(Level.FINE);
- String name = args[0];
- String url = "file:" + name;
- File file = new File(name);
- byte[] bytes = new byte[(int) file.length()];
- DataInputStream in = new DataInputStream(new FileInputStream(file));
- in.readFully(bytes);
- Configuration conf = NutchConfiguration.create();
- HtmlParser parser = new HtmlParser();
- parser.setConf(conf);
- Parse parse = parser.getParse(
- new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(
- url);
- System.out.println("data: " + parse.getData());
-
- System.out.println("text: " + parse.getText());
-
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- this.htmlParseFilters = new HtmlParseFilters(getConf());
- this.parserImpl = getConf().get("parser.html.impl", "neko");
- this.defaultCharEncoding = getConf().get(
- "parser.character.encoding.default", "windows-1252");
- this.utils = new DOMContentUtils(conf);
- this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
- Nutch.CACHING_FORBIDDEN_CONTENT);
- }
-
- public Configuration getConf() {
- return this.conf;
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
deleted file mode 100644
index eb382e8..0000000
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
- * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer,
- * XXX in order to avoid dependency on Xalan.
- */
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
- * $Id$
- */
-package org.apache.nutch.parse.html;
-
-/**
- * Class used to verify whether the specified <var>ch</var> conforms to the XML
- * 1.0 definition of whitespace.
- */
-public class XMLCharacterRecognizer {
-
- /**
- * Returns whether the specified <var>ch</var> conforms to the XML 1.0
- * definition of whitespace. Refer to <A
- * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of
- * <CODE>S</CODE></A> for details.
- *
- * @param ch
- * Character to check as XML whitespace.
- * @return true if <var>ch</var> is XML whitespace; otherwise false.
- */
- public static boolean isWhiteSpace(char ch) {
- return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA);
- }
-
- /**
- * Tell if the given range of the character array is whitespace.
- *
- * @param ch
- * Character array to check as XML whitespace.
- * @param start
- * Start index of characters in the array
- * @param length
- * Number of characters in the array
- * @return True if the characters in the array are XML whitespace; otherwise,
- * false.
- */
- public static boolean isWhiteSpace(char ch[], int start, int length) {
-
- int end = start + length;
-
- for (int s = start; s < end; s++) {
- if (!isWhiteSpace(ch[s]))
- return false;
- }
-
- return true;
- }
-
- /**
- * Tell if the content of the buffer is whitespace.
- *
- * @param buf
- * StringBuffer to check as XML whitespace.
- * @return True if characters in buffer are XML whitespace, false otherwise
- */
- public static boolean isWhiteSpace(StringBuffer buf) {
-
- int n = buf.length();
-
- for (int i = 0; i < n; i++) {
- if (!isWhiteSpace(buf.charAt(i)))
- return false;
- }
-
- return true;
- }
-
- /**
- * Tell if the string is whitespace.
- *
- * @param s
- * String to check as XML whitespace.
- * @return True if characters in buffer are XML whitespace, false otherwise
- */
- public static boolean isWhiteSpace(String s) {
-
- if (null != s) {
- int n = s.length();
-
- for (int i = 0; i < n; i++) {
- if (!isWhiteSpace(s.charAt(i)))
- return false;
- }
- }
-
- return true;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package.html b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package.html
deleted file mode 100644
index c650389..0000000
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>An HTML document parsing plugin.</p><p>This package relies on <a href="http://www.apache.org/~andyc/neko/doc/html/index.html">NekoHTML</a>.</p>
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
deleted file mode 100644
index 15725ae..0000000
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
+++ /dev/null
@@ -1,347 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-import java.io.ByteArrayInputStream;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.StringTokenizer;
-
-import org.cyberneko.html.parsers.*;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-import org.xml.sax.*;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-
-/**
- * Unit tests for DOMContentUtils.
- */
-public class TestDOMContentUtils {
-
- private static final String[] testPages = {
- new String("<html><head><title> title </title><script> script </script>"
- + "</head><body> body <a href=\"http://www.nutch.org\">"
- + " anchor </a><!--comment-->" + "</body></html>"),
- new String("<html><head><title> title </title><script> script </script>"
- + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
- + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
- + "</body></html>"),
- new String("<html><head><title> </title>" + "</head><body> "
- + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
- + "</a></a>" + "</body></html>"),
- // this one relies on certain neko fixup behavior, possibly
- // distributing the anchors into the LI's-but not the other
- // anchors (outside of them, instead)! So you get a tree that
- // looks like:
- // ... <li> <a href=/> home </a> </li>
- // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
- // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
- new String("<html><head><title> my title </title>"
- + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
- + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
- + "</body></html>"),
- // test frameset link extraction. The invalid frame in the middle will be
- // fixed to a third standalone frame.
- new String("<html><head><title> my title </title>"
- + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
- + "</frame>" + "<frameset cols=\"20,*\">"
- + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
- + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
- + "</frameset>" + "</frameset>" + "</body></html>"),
- // test <area> and <iframe> link extraction + url normalization
- new String(
- "<html><head><title> my title </title>"
- + "</head><body>"
- + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
- + "<map name=\"green\">"
- + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
- + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
- + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
- + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
- + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
- // test whitespace processing for plain text extraction
- new String(
- "<html><head>\n <title> my\t\n title\r\n </title>\n"
- + " </head>\n"
- + " <body>\n"
- + " <h1> Whitespace\ttest </h1> \n"
- + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
- + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
- + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
- + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
- + "<table>"
- + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
- + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
- + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
- + "</table>put some text here<Br>and there."
- + "<h2>End\tthis\rmadness\n!</h2>\r\n"
- + " . . . ." + "</body> </html>"),
-
- // test that <a rel=nofollow> links are not returned
- new String("<html><head></head><body>"
- + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
- + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
- + "</body></html>"),
- // test that POST form actions are skipped
- new String("<html><head></head><body>"
- + "<form method='POST' action='/search.jsp'><input type=text>"
- + "<input type=submit><p>test1</p></form>"
- + "<form method='GET' action='/dummy.jsp'><input type=text>"
- + "<input type=submit><p>test2</p></form></body></html>"),
- // test that all form actions are skipped
- new String("<html><head></head><body>"
- + "<form method='POST' action='/search.jsp'><input type=text>"
- + "<input type=submit><p>test1</p></form>"
- + "<form method='GET' action='/dummy.jsp'><input type=text>"
- + "<input type=submit><p>test2</p></form></body></html>"),
- new String("<html><head><title> title </title>" + "</head><body>"
- + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
- + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
- new String("<html><head><title> title </title>" + "</head><body>"
- + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
- + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
- + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"),
- new String("<html><head><title> title </title>" + "</head><body>"
- + "<a href=\"g\"><!--no anchor--></a>"
- + "<a href=\"g1\"> <!--whitespace--> </a>"
- + "<a href=\"g2\"> <img src=test.gif alt='bla bla'> </a>"
- + "</body></html>"), };
-
- private static int SKIP = 9;
-
- private static String[] testBaseHrefs = { "http://www.nutch.org",
- "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
- "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
- "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
- "http://www.nutch.org//", "http://www.nutch.org/",
- "http://www.nutch.org/", "http://www.nutch.org/",
- "http://www.nutch.org/;something", "http://www.nutch.org/" };
-
- private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
-
- private static URL[] testBaseHrefURLs = new URL[testPages.length];
-
- private static final String[] answerText = {
- "title body anchor",
- "title body home bots",
- "separate this from this",
- "my title body home 1 2",
- "my title",
- "my title the bottom",
- "my title Whitespace test whitespace test "
- + "This is a whitespace test . Newlines should appear as space too. "
- + "Tabs are spaces too. This is a break -> and the line after break . "
- + "one two three space here space there no space "
- + "one two two three three four put some text here and there. "
- + "End this madness ! . . . .", "ignore ignore", "test1 test2",
- "test1 test2", "title anchor1 anchor2 anchor3",
- "title anchor1 anchor2 anchor3 anchor4 anchor5", "title" };
-
- private static final String[] answerTitle = { "title", "title", "",
- "my title", "my title", "my title", "my title", "", "", "", "title",
- "title", "title" };
-
- // note: should be in page-order
- private static Outlink[][] answerOutlinks;
-
- private static Configuration conf;
- private static DOMContentUtils utils = null;
-
- @Before
- public void setup() {
- conf = NutchConfiguration.create();
- conf.setBoolean("parser.html.form.use_action", true);
- utils = new DOMContentUtils(conf);
- DOMFragmentParser parser = new DOMFragmentParser();
- try {
- parser
- .setFeature(
- "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
- true);
- } catch (SAXException e) {
- }
- for (int i = 0; i < testPages.length; i++) {
- DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
- try {
- parser.parse(
- new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
- node);
- testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
- } catch (Exception e) {
- Assert.assertTrue("caught exception: " + e, false);
- }
- testDOMs[i] = node;
- }
- try {
- answerOutlinks = new Outlink[][] {
- { new Outlink("http://www.nutch.org", "anchor"), },
- { new Outlink("http://www.nutch.org/", "home"),
- new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
- { new Outlink("http://www.nutch.org/", "separate this"),
- new Outlink("http://www.nutch.org/docs/ok", "from this"), },
- { new Outlink("http://www.nutch.org/", "home"),
- new Outlink("http://www.nutch.org/docs/1", "1"),
- new Outlink("http://www.nutch.org/docs/2", "2"), },
- { new Outlink("http://www.nutch.org/frames/top.html", ""),
- new Outlink("http://www.nutch.org/frames/left.html", ""),
- new Outlink("http://www.nutch.org/frames/invalid.html", ""),
- new Outlink("http://www.nutch.org/frames/right.html", ""), },
- { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
- new Outlink("http://www.nutch.org/index.html", ""),
- new Outlink("http://www.nutch.org/maps/#bottom", ""),
- new Outlink("http://www.nutch.org/bot.html", ""),
- new Outlink("http://www.nutch.org/docs/index.html", ""), },
- { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
- {},
- { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
- {},
- { new Outlink("http://www.nutch.org/;x", "anchor1"),
- new Outlink("http://www.nutch.org/g;x", "anchor2"),
- new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
- {
- // this is tricky - see RFC3986 section 5.4.1 example 7
- new Outlink("http://www.nutch.org/g", "anchor1"),
- new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
- new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
- new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
- new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
- "anchor5") },
- { new Outlink("http://www.nutch.org/g", ""),
- new Outlink("http://www.nutch.org/g1", ""),
- new Outlink("http://www.nutch.org/g2", "bla bla"),
- new Outlink("http://www.nutch.org/test.gif", "bla bla"), } };
-
- } catch (MalformedURLException e) {
-
- }
- }
-
- private static boolean equalsIgnoreWhitespace(String s1, String s2) {
- StringTokenizer st1 = new StringTokenizer(s1);
- StringTokenizer st2 = new StringTokenizer(s2);
-
- while (st1.hasMoreTokens()) {
- if (!st2.hasMoreTokens())
- return false;
- if (!st1.nextToken().equals(st2.nextToken()))
- return false;
- }
- if (st2.hasMoreTokens())
- return false;
- return true;
- }
-
- @Test
- public void testGetText() {
- if (testDOMs[0] == null)
- setup();
- for (int i = 0; i < testPages.length; i++) {
- StringBuffer sb = new StringBuffer();
- utils.getText(sb, testDOMs[i]);
- String text = sb.toString();
- Assert.assertTrue(
- "expecting text: " + answerText[i]
- + System.getProperty("line.separator")
- + System.getProperty("line.separator") + "got text: " + text,
- equalsIgnoreWhitespace(answerText[i], text));
- }
- }
-
- @Test
- public void testGetTitle() {
- if (testDOMs[0] == null)
- setup();
- for (int i = 0; i < testPages.length; i++) {
- StringBuffer sb = new StringBuffer();
- utils.getTitle(sb, testDOMs[i]);
- String text = sb.toString();
- Assert.assertTrue(
- "expecting text: " + answerText[i]
- + System.getProperty("line.separator")
- + System.getProperty("line.separator") + "got text: " + text,
- equalsIgnoreWhitespace(answerTitle[i], text));
- }
- }
-
- @Test
- public void testGetOutlinks() {
- if (testDOMs[0] == null)
- setup();
- for (int i = 0; i < testPages.length; i++) {
- ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
- if (i == SKIP) {
- conf.setBoolean("parser.html.form.use_action", false);
- utils.setConf(conf);
- } else {
- conf.setBoolean("parser.html.form.use_action", true);
- utils.setConf(conf);
- }
- utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
- Outlink[] outlinkArr = new Outlink[outlinks.size()];
- outlinkArr = (Outlink[]) outlinks.toArray(outlinkArr);
- compareOutlinks(answerOutlinks[i], outlinkArr);
- }
- }
-
- private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
- for (int i = 0; i < o.length; i++) {
- sb.append(o[i].toString());
- sb.append(System.getProperty("line.separator"));
- }
- }
-
- private static final String outlinksString(Outlink[] o) {
- StringBuffer sb = new StringBuffer();
- appendOutlinks(sb, o);
- return sb.toString();
- }
-
- private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
- if (o1.length != o2.length) {
- Assert.assertTrue(
- "got wrong number of outlinks (expecting " + o1.length + ", got "
- + o2.length + ")" + System.getProperty("line.separator")
- + "answer: " + System.getProperty("line.separator")
- + outlinksString(o1) + System.getProperty("line.separator")
- + "got: " + System.getProperty("line.separator")
- + outlinksString(o2) + System.getProperty("line.separator"),
- false);
- }
-
- for (int i = 0; i < o1.length; i++) {
- if (!o1[i].equals(o2[i])) {
- Assert.assertTrue(
- "got wrong outlinks at position " + i
- + System.getProperty("line.separator") + "answer: "
- + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
- + "', anchor: '" + o1[i].getAnchor() + "'"
- + System.getProperty("line.separator") + "got: "
- + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
- + "', anchor: '" + o2[i].getAnchor() + "'", false);
-
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
deleted file mode 100644
index 7099f50..0000000
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ /dev/null
@@ -1,122 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import java.nio.charset.Charset;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.html.HtmlParser;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class TestHtmlParser {
-
- public static final Logger LOG = LoggerFactory
- .getLogger(TestHtmlParser.class);
-
- private static final String encodingTestKeywords = "français, español, \u0440\u0443\u0441\u0441\u043a\u0438\u0439 \u044f\u0437\u044b\u043a, \u010de\u0161tina, \u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac";
- private static final String encodingTestBody = "<ul>\n <li>français\n <li>español\n <li>\u0440\u0443\u0441\u0441\u043a\u0438\u0439 \u044f\u0437\u044b\u043a\n <li>\u010de\u0161tina\n <li>\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac\n</ul>";
- private static final String encodingTestContent = "<title>"
- + encodingTestKeywords + "</title>\n"
- + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
- + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
-
- private static String[][] encodingTestPages = {
- {
- "HTML4, utf-8, meta http-equiv, no quotes",
- "utf-8",
- "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
- + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
- + "<html>\n<head>\n"
- + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
- + encodingTestContent },
- {
- "HTML4, utf-8, meta http-equiv, single quotes",
- "utf-8",
- "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
- + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
- + "<html>\n<head>\n"
- + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
- + encodingTestContent },
- {
- "XHTML, utf-8, meta http-equiv, double quotes",
- "utf-8",
- "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
- + "<html>\n<head>\n"
- + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
- + encodingTestContent },
- {
- "HTML5, utf-8, meta charset",
- "utf-8",
- "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
- + encodingTestContent },
- { "HTML5, utf-8, BOM", "utf-8",
- "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
- { "HTML5, utf-16, BOM", "utf-16",
- "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
-
- private Configuration conf;
- private Parser parser;
-
- public TestHtmlParser() {
- conf = NutchConfiguration.create();
- parser = new HtmlParser();
- parser.setConf(conf);
- }
-
- protected Parse parse(byte[] contentBytes) {
- String dummyUrl = "http://dummy.url/";
- return parser.getParse(
- new Content(dummyUrl, dummyUrl, contentBytes, "text/html",
- new Metadata(), conf)).get(dummyUrl);
- }
-
- @Test
- public void testEncodingDetection() {
- for (String[] testPage : encodingTestPages) {
- String name = testPage[0];
- Charset charset = Charset.forName(testPage[1]);
- byte[] contentBytes = testPage[2].getBytes(charset);
- Parse parse = parse(contentBytes);
- String text = parse.getText();
- String title = parse.getData().getTitle();
- String keywords = parse.getData().getMeta("keywords");
- LOG.info(name);
- LOG.info("title:\t" + title);
- LOG.info("keywords:\t" + keywords);
- LOG.info("text:\t" + text);
- Assert.assertEquals("Title not extracted properly (" + name + ")",
- encodingTestKeywords, title);
- for (String keyword : encodingTestKeywords.split(",\\s*")) {
- Assert.assertTrue(keyword + " not found in text (" + name + ")",
- text.contains(keyword));
- }
- Assert.assertNotNull("No keywords extracted", keywords);
- Assert.assertEquals("Keywords not extracted properly (" + name + ")",
- encodingTestKeywords, keywords);
- }
- }
-
-}