You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/09 07:34:37 UTC
svn commit: r1650447 [16/25] - in /nutch/branches/2.x: ./
src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/
src/java/org/apache/nutch/api/impl/db/
src/java/org/apache/nutch/api/model/response/
src/java/org/apache/nutch/api/resources/ s...
Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java Fri Jan 9 06:34:33 2015
@@ -39,136 +39,125 @@ import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.ext.LexicalHandler;
+
/**
- * This class takes SAX events (in addition to some extra events
- * that SAX doesn't handle yet) and adds the result to a document
- * or document fragment.
+ * This class takes SAX events (in addition to some extra events that SAX
+ * doesn't handle yet) and adds the result to a document or document fragment.
*/
-public class DOMBuilder
- implements ContentHandler, LexicalHandler
-{
+public class DOMBuilder implements ContentHandler, LexicalHandler {
- /** Root document */
+ /** Root document */
public Document m_doc;
- /** Current node */
+ /** Current node */
protected Node m_currentNode = null;
- /** First node of document fragment or null if not a DocumentFragment */
+ /** First node of document fragment or null if not a DocumentFragment */
public DocumentFragment m_docFrag = null;
- /** Vector of element nodes */
+ /** Vector of element nodes */
protected Stack<Element> m_elemStack = new Stack<Element>();
/**
- * DOMBuilder instance constructor... it will add the DOM nodes
- * to the document fragment.
- *
- * @param doc Root document
- * @param node Current node
+ * DOMBuilder instance constructor... it will add the DOM nodes to the
+ * document fragment.
+ *
+ * @param doc
+ * Root document
+ * @param node
+ * Current node
*/
- public DOMBuilder(Document doc, Node node)
- {
+ public DOMBuilder(Document doc, Node node) {
m_doc = doc;
m_currentNode = node;
}
/**
- * DOMBuilder instance constructor... it will add the DOM nodes
- * to the document fragment.
- *
- * @param doc Root document
- * @param docFrag Document fragment
+ * DOMBuilder instance constructor... it will add the DOM nodes to the
+ * document fragment.
+ *
+ * @param doc
+ * Root document
+ * @param docFrag
+ * Document fragment
*/
- public DOMBuilder(Document doc, DocumentFragment docFrag)
- {
+ public DOMBuilder(Document doc, DocumentFragment docFrag) {
m_doc = doc;
m_docFrag = docFrag;
}
/**
- * DOMBuilder instance constructor... it will add the DOM nodes
- * to the document.
- *
- * @param doc Root document
+ * DOMBuilder instance constructor... it will add the DOM nodes to the
+ * document.
+ *
+ * @param doc
+ * Root document
*/
- public DOMBuilder(Document doc)
- {
+ public DOMBuilder(Document doc) {
m_doc = doc;
}
/**
- * Get the root node of the DOM being created. This
- * is either a Document or a DocumentFragment.
- *
+ * Get the root node of the DOM being created. This is either a Document or a
+ * DocumentFragment.
+ *
* @return The root document or document fragment if not null
*/
- public Node getRootNode()
- {
+ public Node getRootNode() {
return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc;
}
/**
* Get the node currently being processed.
- *
+ *
* @return the current node being processed
*/
- public Node getCurrentNode()
- {
+ public Node getCurrentNode() {
return m_currentNode;
}
/**
* Return null since there is no Writer for this class.
- *
+ *
* @return null
*/
- public java.io.Writer getWriter()
- {
+ public java.io.Writer getWriter() {
return null;
}
/**
* Append a node to the current container.
- *
- * @param newNode New node to append
+ *
+ * @param newNode
+ * New node to append
*/
- protected void append(Node newNode) throws org.xml.sax.SAXException
- {
+ protected void append(Node newNode) throws org.xml.sax.SAXException {
Node currentNode = m_currentNode;
- if (null != currentNode)
- {
+ if (null != currentNode) {
currentNode.appendChild(newNode);
// System.out.println(newNode.getNodeName());
- }
- else if (null != m_docFrag)
- {
+ } else if (null != m_docFrag) {
m_docFrag.appendChild(newNode);
- }
- else
- {
+ } else {
boolean ok = true;
short type = newNode.getNodeType();
- if (type == Node.TEXT_NODE)
- {
+ if (type == Node.TEXT_NODE) {
String data = newNode.getNodeValue();
- if ((null != data) && (data.trim().length() > 0))
- {
- throw new org.xml.sax.SAXException("Warning: can't output text before document element! Ignoring...");
+ if ((null != data) && (data.trim().length() > 0)) {
+ throw new org.xml.sax.SAXException(
+ "Warning: can't output text before document element! Ignoring...");
}
ok = false;
- }
- else if (type == Node.ELEMENT_NODE)
- {
- if (m_doc.getDocumentElement() != null)
- {
- throw new org.xml.sax.SAXException("Can't have more than one root on a DOM!");
+ } else if (type == Node.ELEMENT_NODE) {
+ if (m_doc.getDocumentElement() != null) {
+ throw new org.xml.sax.SAXException(
+ "Can't have more than one root on a DOM!");
}
}
@@ -179,132 +168,139 @@ public class DOMBuilder
/**
* Receive an object for locating the origin of SAX document events.
- *
- * <p>SAX parsers are strongly encouraged (though not absolutely
- * required) to supply a locator: if it does so, it must supply
- * the locator to the application by invoking this method before
- * invoking any of the other methods in the ContentHandler
- * interface.</p>
- *
- * <p>The locator allows the application to determine the end
- * position of any document-related event, even if the parser is
- * not reporting an error. Typically, the application will
- * use this information for reporting its own errors (such as
- * character content that does not match an application's
- * business rules). The information returned by the locator
- * is probably not sufficient for use with a search engine.</p>
- *
- * <p>Note that the locator will return correct information only
- * during the invocation of the events in this interface. The
- * application should not attempt to use it at any other time.</p>
- *
- * @param locator An object that can return the location of
- * any SAX document event.
+ *
+ * <p>
+ * SAX parsers are strongly encouraged (though not absolutely required) to
+ * supply a locator: if it does so, it must supply the locator to the
+ * application by invoking this method before invoking any of the other
+ * methods in the ContentHandler interface.
+ * </p>
+ *
+ * <p>
+ * The locator allows the application to determine the end position of any
+ * document-related event, even if the parser is not reporting an error.
+ * Typically, the application will use this information for reporting its own
+ * errors (such as character content that does not match an application's
+ * business rules). The information returned by the locator is probably not
+ * sufficient for use with a search engine.
+ * </p>
+ *
+ * <p>
+ * Note that the locator will return correct information only during the
+ * invocation of the events in this interface. The application should not
+ * attempt to use it at any other time.
+ * </p>
+ *
+ * @param locator
+ * An object that can return the location of any SAX document event.
* @see org.xml.sax.Locator
*/
- public void setDocumentLocator(Locator locator)
- {
+ public void setDocumentLocator(Locator locator) {
// No action for the moment.
}
/**
* Receive notification of the beginning of a document.
- *
- * <p>The SAX parser will invoke this method only once, before any
- * other methods in this interface or in DTDHandler (except for
- * setDocumentLocator).</p>
+ *
+ * <p>
+ * The SAX parser will invoke this method only once, before any other methods
+ * in this interface or in DTDHandler (except for setDocumentLocator).
+ * </p>
*/
- public void startDocument() throws org.xml.sax.SAXException
- {
+ public void startDocument() throws org.xml.sax.SAXException {
// No action for the moment.
}
/**
* Receive notification of the end of a document.
- *
- * <p>The SAX parser will invoke this method only once, and it will
- * be the last method invoked during the parse. The parser shall
- * not invoke this method until it has either abandoned parsing
- * (because of an unrecoverable error) or reached the end of
- * input.</p>
+ *
+ * <p>
+ * The SAX parser will invoke this method only once, and it will be the last
+ * method invoked during the parse. The parser shall not invoke this method
+ * until it has either abandoned parsing (because of an unrecoverable error)
+ * or reached the end of input.
+ * </p>
*/
- public void endDocument() throws org.xml.sax.SAXException
- {
+ public void endDocument() throws org.xml.sax.SAXException {
// No action for the moment.
}
/**
* Receive notification of the beginning of an element.
- *
- * <p>The Parser will invoke this method at the beginning of every
- * element in the XML document; there will be a corresponding
- * endElement() event for every startElement() event (even when the
- * element is empty). All of the element's content will be
- * reported, in order, before the corresponding endElement()
- * event.</p>
- *
- * <p>If the element name has a namespace prefix, the prefix will
- * still be attached. Note that the attribute list provided will
- * contain only attributes with explicit values (specified or
- * defaulted): #IMPLIED attributes will be omitted.</p>
- *
- *
- * @param ns The namespace of the node
- * @param localName The local part of the qualified name
- * @param name The element name.
- * @param atts The attributes attached to the element, if any.
+ *
+ * <p>
+ * The Parser will invoke this method at the beginning of every element in the
+ * XML document; there will be a corresponding endElement() event for every
+ * startElement() event (even when the element is empty). All of the element's
+ * content will be reported, in order, before the corresponding endElement()
+ * event.
+ * </p>
+ *
+ * <p>
+ * If the element name has a namespace prefix, the prefix will still be
+ * attached. Note that the attribute list provided will contain only
+ * attributes with explicit values (specified or defaulted): #IMPLIED
+ * attributes will be omitted.
+ * </p>
+ *
+ *
+ * @param ns
+ * The namespace of the node
+ * @param localName
+ * The local part of the qualified name
+ * @param name
+ * The element name.
+ * @param atts
+ * The attributes attached to the element, if any.
* @see #endElement
* @see org.xml.sax.Attributes
*/
- public void startElement(
- String ns, String localName, String name, Attributes atts)
- throws org.xml.sax.SAXException
- {
+ public void startElement(String ns, String localName, String name,
+ Attributes atts) throws org.xml.sax.SAXException {
Element elem;
- // Note that the namespace-aware call must be used to correctly
- // construct a Level 2 DOM, even for non-namespaced nodes.
+ // Note that the namespace-aware call must be used to correctly
+ // construct a Level 2 DOM, even for non-namespaced nodes.
if ((null == ns) || (ns.length() == 0))
- elem = m_doc.createElementNS(null,name);
+ elem = m_doc.createElementNS(null, name);
else
elem = m_doc.createElementNS(ns, name);
append(elem);
- try
- {
+ try {
int nAtts = atts.getLength();
- if (0 != nAtts)
- {
- for (int i = 0; i < nAtts; i++)
- {
+ if (0 != nAtts) {
+ for (int i = 0; i < nAtts; i++) {
- //System.out.println("type " + atts.getType(i) + " name " + atts.getLocalName(i) );
+ // System.out.println("type " + atts.getType(i) + " name " +
+ // atts.getLocalName(i) );
// First handle a possible ID attribute
if (atts.getType(i).equalsIgnoreCase("ID"))
setIDAttribute(atts.getValue(i), elem);
String attrNS = atts.getURI(i);
- if("".equals(attrNS))
+ if ("".equals(attrNS))
attrNS = null; // DOM represents no-namespace as null
// System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i)
- // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i));
+ // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i));
// Crimson won't let us set an xmlns: attribute on the DOM.
String attrQName = atts.getQName(i);
- // In SAX, xmlns: attributes have an empty namespace, while in DOM they should have the xmlns namespace
+ // In SAX, xmlns: attributes have an empty namespace, while in DOM
+ // they should have the xmlns namespace
if (attrQName.startsWith("xmlns:"))
attrNS = "http://www.w3.org/2000/xmlns/";
// ALWAYS use the DOM Level 2 call!
- elem.setAttributeNS(attrNS,attrQName, atts.getValue(i));
+ elem.setAttributeNS(attrNS, attrQName, atts.getValue(i));
}
}
@@ -315,9 +311,7 @@ public class DOMBuilder
m_currentNode = elem;
// append(elem);
- }
- catch(java.lang.Exception de)
- {
+ } catch (java.lang.Exception de) {
// de.printStackTrace();
throw new org.xml.sax.SAXException(de);
}
@@ -325,74 +319,87 @@ public class DOMBuilder
}
/**
-
-
-
+ *
+ *
+ *
* Receive notification of the end of an element.
- *
- * <p>The SAX parser will invoke this method at the end of every
- * element in the XML document; there will be a corresponding
- * startElement() event for every endElement() event (even when the
- * element is empty).</p>
- *
- * <p>If the element name has a namespace prefix, the prefix will
- * still be attached to the name.</p>
- *
- *
- * @param ns the namespace of the element
- * @param localName The local part of the qualified name of the element
- * @param name The element name
+ *
+ * <p>
+ * The SAX parser will invoke this method at the end of every element in the
+ * XML document; there will be a corresponding startElement() event for every
+ * endElement() event (even when the element is empty).
+ * </p>
+ *
+ * <p>
+ * If the element name has a namespace prefix, the prefix will still be
+ * attached to the name.
+ * </p>
+ *
+ *
+ * @param ns
+ * the namespace of the element
+ * @param localName
+ * The local part of the qualified name of the element
+ * @param name
+ * The element name
*/
public void endElement(String ns, String localName, String name)
- throws org.xml.sax.SAXException
- {
+ throws org.xml.sax.SAXException {
m_elemStack.pop();
m_currentNode = m_elemStack.isEmpty() ? null : m_elemStack.peek();
}
/**
* Set an ID string to node association in the ID table.
- *
- * @param id The ID string.
- * @param elem The associated ID.
+ *
+ * @param id
+ * The ID string.
+ * @param elem
+ * The associated ID.
*/
- public void setIDAttribute(String id, Element elem)
- {
+ public void setIDAttribute(String id, Element elem) {
// Do nothing. This method is meant to be overiden.
}
/**
* Receive notification of character data.
- *
- * <p>The Parser will call this method to report each chunk of
- * character data. SAX parsers may return all contiguous character
- * data in a single chunk, or they may split it into several
- * chunks; however, all of the characters in any single event
- * must come from the same external entity, so that the Locator
- * provides useful information.</p>
- *
- * <p>The application must not attempt to read from the array
- * outside of the specified range.</p>
- *
- * <p>Note that some parsers will report whitespace using the
- * ignorableWhitespace() method rather than this one (validating
- * parsers must do so).</p>
- *
- * @param ch The characters from the XML document.
- * @param start The start position in the array.
- * @param length The number of characters to read from the array.
+ *
+ * <p>
+ * The Parser will call this method to report each chunk of character data.
+ * SAX parsers may return all contiguous character data in a single chunk, or
+ * they may split it into several chunks; however, all of the characters in
+ * any single event must come from the same external entity, so that the
+ * Locator provides useful information.
+ * </p>
+ *
+ * <p>
+ * The application must not attempt to read from the array outside of the
+ * specified range.
+ * </p>
+ *
+ * <p>
+ * Note that some parsers will report whitespace using the
+ * ignorableWhitespace() method rather than this one (validating parsers must
+ * do so).
+ * </p>
+ *
+ * @param ch
+ * The characters from the XML document.
+ * @param start
+ * The start position in the array.
+ * @param length
+ * The number of characters to read from the array.
* @see #ignorableWhitespace
* @see org.xml.sax.Locator
*/
- public void characters(char ch[], int start, int length) throws org.xml.sax.SAXException
- {
- if(isOutsideDocElem()
- && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
- return; // avoid DOM006 Hierarchy request error
+ public void characters(char ch[], int start, int length)
+ throws org.xml.sax.SAXException {
+ if (isOutsideDocElem()
+ && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+ return; // avoid DOM006 Hierarchy request error
- if (m_inCData)
- {
+ if (m_inCData) {
cdata(ch, start, length);
return;
@@ -400,57 +407,55 @@ public class DOMBuilder
String s = new String(ch, start, length);
Node childNode;
- childNode = m_currentNode != null ? m_currentNode.getLastChild(): null;
- if( childNode != null && childNode.getNodeType() == Node.TEXT_NODE ){
- ((Text)childNode).appendData(s);
- }
- else{
- Text text = m_doc.createTextNode(s);
- append(text);
+ childNode = m_currentNode != null ? m_currentNode.getLastChild() : null;
+ if (childNode != null && childNode.getNodeType() == Node.TEXT_NODE) {
+ ((Text) childNode).appendData(s);
+ } else {
+ Text text = m_doc.createTextNode(s);
+ append(text);
}
}
/**
- * If available, when the disable-output-escaping attribute is used,
- * output raw text without escaping. A PI will be inserted in front
- * of the node with the name "lotusxsl-next-is-raw" and a value of
- * "formatter-to-dom".
- *
- * @param ch Array containing the characters
- * @param start Index to start of characters in the array
- * @param length Number of characters in the array
+ * If available, when the disable-output-escaping attribute is used, output
+ * raw text without escaping. A PI will be inserted in front of the node with
+ * the name "lotusxsl-next-is-raw" and a value of "formatter-to-dom".
+ *
+ * @param ch
+ * Array containing the characters
+ * @param start
+ * Index to start of characters in the array
+ * @param length
+ * Number of characters in the array
*/
public void charactersRaw(char ch[], int start, int length)
- throws org.xml.sax.SAXException
- {
- if(isOutsideDocElem()
- && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
- return; // avoid DOM006 Hierarchy request error
-
+ throws org.xml.sax.SAXException {
+ if (isOutsideDocElem()
+ && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+ return; // avoid DOM006 Hierarchy request error
String s = new String(ch, start, length);
append(m_doc.createProcessingInstruction("xslt-next-is-raw",
- "formatter-to-dom"));
+ "formatter-to-dom"));
append(m_doc.createTextNode(s));
}
/**
* Report the beginning of an entity.
- *
- * The start and end of the document entity are not reported.
- * The start and end of the external DTD subset are reported
- * using the pseudo-name "[dtd]". All other events must be
- * properly nested within start/end entity events.
- *
- * @param name The name of the entity. If it is a parameter
- * entity, the name will begin with '%'.
+ *
+ * The start and end of the document entity are not reported. The start and
+ * end of the external DTD subset are reported using the pseudo-name "[dtd]".
+ * All other events must be properly nested within start/end entity events.
+ *
+ * @param name
+ * The name of the entity. If it is a parameter entity, the name will
+ * begin with '%'.
* @see #endEntity
* @see org.xml.sax.ext.DeclHandler#internalEntityDecl
* @see org.xml.sax.ext.DeclHandler#externalEntityDecl
*/
- public void startEntity(String name) throws org.xml.sax.SAXException
- {
+ public void startEntity(String name) throws org.xml.sax.SAXException {
// Almost certainly the wrong behavior...
// entityReference(name);
@@ -458,49 +463,58 @@ public class DOMBuilder
/**
* Report the end of an entity.
- *
- * @param name The name of the entity that is ending.
+ *
+ * @param name
+ * The name of the entity that is ending.
* @see #startEntity
*/
- public void endEntity(String name) throws org.xml.sax.SAXException{}
+ public void endEntity(String name) throws org.xml.sax.SAXException {
+ }
/**
* Receive notivication of a entityReference.
- *
- * @param name name of the entity reference
+ *
+ * @param name
+ * name of the entity reference
*/
- public void entityReference(String name) throws org.xml.sax.SAXException
- {
+ public void entityReference(String name) throws org.xml.sax.SAXException {
append(m_doc.createEntityReference(name));
}
/**
* Receive notification of ignorable whitespace in element content.
- *
- * <p>Validating Parsers must use this method to report each chunk
- * of ignorable whitespace (see the W3C XML 1.0 recommendation,
- * section 2.10): non-validating parsers may also use this method
- * if they are capable of parsing and using content models.</p>
- *
- * <p>SAX parsers may return all contiguous whitespace in a single
- * chunk, or they may split it into several chunks; however, all of
- * the characters in any single event must come from the same
- * external entity, so that the Locator provides useful
- * information.</p>
- *
- * <p>The application must not attempt to read from the array
- * outside of the specified range.</p>
- *
- * @param ch The characters from the XML document.
- * @param start The start position in the array.
- * @param length The number of characters to read from the array.
+ *
+ * <p>
+ * Validating Parsers must use this method to report each chunk of ignorable
+ * whitespace (see the W3C XML 1.0 recommendation, section 2.10):
+ * non-validating parsers may also use this method if they are capable of
+ * parsing and using content models.
+ * </p>
+ *
+ * <p>
+ * SAX parsers may return all contiguous whitespace in a single chunk, or they
+ * may split it into several chunks; however, all of the characters in any
+ * single event must come from the same external entity, so that the Locator
+ * provides useful information.
+ * </p>
+ *
+ * <p>
+ * The application must not attempt to read from the array outside of the
+ * specified range.
+ * </p>
+ *
+ * @param ch
+ * The characters from the XML document.
+ * @param start
+ * The start position in the array.
+ * @param length
+ * The number of characters to read from the array.
* @see #characters
*/
public void ignorableWhitespace(char ch[], int start, int length)
- throws org.xml.sax.SAXException
- {
- if(isOutsideDocElem())
- return; // avoid DOM006 Hierarchy request error
+ throws org.xml.sax.SAXException {
+ if (isOutsideDocElem())
+ return; // avoid DOM006 Hierarchy request error
String s = new String(ch, start, length);
@@ -509,232 +523,244 @@ public class DOMBuilder
/**
* Tell if the current node is outside the document element.
- *
+ *
* @return true if the current node is outside the document element.
*/
- private boolean isOutsideDocElem()
- {
- return (null == m_docFrag) && m_elemStack.size() == 0 && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE);
- }
+ private boolean isOutsideDocElem() {
+ return (null == m_docFrag)
+ && m_elemStack.size() == 0
+ && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE);
+ }
/**
* Receive notification of a processing instruction.
- *
- * <p>The Parser will invoke this method once for each processing
- * instruction found: note that processing instructions may occur
- * before or after the main document element.</p>
- *
- * <p>A SAX parser should never report an XML declaration (XML 1.0,
- * section 2.8) or a text declaration (XML 1.0, section 4.3.1)
- * using this method.</p>
- *
- * @param target The processing instruction target.
- * @param data The processing instruction data, or null if
- * none was supplied.
+ *
+ * <p>
+ * The Parser will invoke this method once for each processing instruction
+ * found: note that processing instructions may occur before or after the main
+ * document element.
+ * </p>
+ *
+ * <p>
+ * A SAX parser should never report an XML declaration (XML 1.0, section 2.8)
+ * or a text declaration (XML 1.0, section 4.3.1) using this method.
+ * </p>
+ *
+ * @param target
+ * The processing instruction target.
+ * @param data
+ * The processing instruction data, or null if none was supplied.
*/
public void processingInstruction(String target, String data)
- throws org.xml.sax.SAXException
- {
+ throws org.xml.sax.SAXException {
append(m_doc.createProcessingInstruction(target, data));
}
/**
* Report an XML comment anywhere in the document.
- *
- * This callback will be used for comments inside or outside the
- * document element, including comments in the external DTD
- * subset (if read).
- *
- * @param ch An array holding the characters in the comment.
- * @param start The starting position in the array.
- * @param length The number of characters to use from the array.
+ *
+ * This callback will be used for comments inside or outside the document
+ * element, including comments in the external DTD subset (if read).
+ *
+ * @param ch
+ * An array holding the characters in the comment.
+ * @param start
+ * The starting position in the array.
+ * @param length
+ * The number of characters to use from the array.
*/
- public void comment(char ch[], int start, int length) throws org.xml.sax.SAXException
- {
+ public void comment(char ch[], int start, int length)
+ throws org.xml.sax.SAXException {
// tagsoup sometimes submits invalid values here
- if (ch == null || start < 0 || length >= (ch.length - start) || length < 0) return;
+ if (ch == null || start < 0 || length >= (ch.length - start) || length < 0)
+ return;
append(m_doc.createComment(new String(ch, start, length)));
}
- /** Flag indicating that we are processing a CData section */
+ /** Flag indicating that we are processing a CData section */
protected boolean m_inCData = false;
/**
* Report the start of a CDATA section.
- *
+ *
* @see #endCDATA
*/
- public void startCDATA() throws org.xml.sax.SAXException
- {
+ public void startCDATA() throws org.xml.sax.SAXException {
m_inCData = true;
append(m_doc.createCDATASection(""));
}
/**
* Report the end of a CDATA section.
- *
+ *
* @see #startCDATA
*/
- public void endCDATA() throws org.xml.sax.SAXException
- {
+ public void endCDATA() throws org.xml.sax.SAXException {
m_inCData = false;
}
/**
* Receive notification of cdata.
- *
- * <p>The Parser will call this method to report each chunk of
- * character data. SAX parsers may return all contiguous character
- * data in a single chunk, or they may split it into several
- * chunks; however, all of the characters in any single event
- * must come from the same external entity, so that the Locator
- * provides useful information.</p>
- *
- * <p>The application must not attempt to read from the array
- * outside of the specified range.</p>
- *
- * <p>Note that some parsers will report whitespace using the
- * ignorableWhitespace() method rather than this one (validating
- * parsers must do so).</p>
- *
- * @param ch The characters from the XML document.
- * @param start The start position in the array.
- * @param length The number of characters to read from the array.
+ *
+ * <p>
+ * The Parser will call this method to report each chunk of character data.
+ * SAX parsers may return all contiguous character data in a single chunk, or
+ * they may split it into several chunks; however, all of the characters in
+ * any single event must come from the same external entity, so that the
+ * Locator provides useful information.
+ * </p>
+ *
+ * <p>
+ * The application must not attempt to read from the array outside of the
+ * specified range.
+ * </p>
+ *
+ * <p>
+ * Note that some parsers will report whitespace using the
+ * ignorableWhitespace() method rather than this one (validating parsers must
+ * do so).
+ * </p>
+ *
+ * @param ch
+ * The characters from the XML document.
+ * @param start
+ * The start position in the array.
+ * @param length
+ * The number of characters to read from the array.
* @see #ignorableWhitespace
* @see org.xml.sax.Locator
*/
- public void cdata(char ch[], int start, int length) throws org.xml.sax.SAXException
- {
- if(isOutsideDocElem()
- && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
- return; // avoid DOM006 Hierarchy request error
+ public void cdata(char ch[], int start, int length)
+ throws org.xml.sax.SAXException {
+ if (isOutsideDocElem()
+ && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+ return; // avoid DOM006 Hierarchy request error
String s = new String(ch, start, length);
- // XXX ab@apache.org: modified from the original, to accomodate TagSoup.
+ // XXX ab@apache.org: modified from the original, to accommodate TagSoup.
Node n = m_currentNode.getLastChild();
if (n instanceof CDATASection)
- ((CDATASection)n).appendData(s);
+ ((CDATASection) n).appendData(s);
else if (n instanceof Comment)
- ((Comment)n).appendData(s);
+ ((Comment) n).appendData(s);
}
/**
* Report the start of DTD declarations, if any.
- *
- * Any declarations are assumed to be in the internal subset
- * unless otherwise indicated.
- *
- * @param name The document type name.
- * @param publicId The declared public identifier for the
- * external DTD subset, or null if none was declared.
- * @param systemId The declared system identifier for the
- * external DTD subset, or null if none was declared.
+ *
+ * Any declarations are assumed to be in the internal subset unless otherwise
+ * indicated.
+ *
+ * @param name
+ * The document type name.
+ * @param publicId
+ * The declared public identifier for the external DTD subset, or
+ * null if none was declared.
+ * @param systemId
+ * The declared system identifier for the external DTD subset, or
+ * null if none was declared.
* @see #endDTD
* @see #startEntity
*/
public void startDTD(String name, String publicId, String systemId)
- throws org.xml.sax.SAXException
- {
+ throws org.xml.sax.SAXException {
// Do nothing for now.
}
/**
* Report the end of DTD declarations.
- *
+ *
* @see #startDTD
*/
- public void endDTD() throws org.xml.sax.SAXException
- {
+ public void endDTD() throws org.xml.sax.SAXException {
// Do nothing for now.
}
/**
* Begin the scope of a prefix-URI Namespace mapping.
- *
- * <p>The information from this event is not necessary for
- * normal Namespace processing: the SAX XML reader will
- * automatically replace prefixes for element and attribute
- * names when the http://xml.org/sax/features/namespaces
- * feature is true (the default).</p>
- *
- * <p>There are cases, however, when applications need to
- * use prefixes in character data or in attribute values,
- * where they cannot safely be expanded automatically; the
- * start/endPrefixMapping event supplies the information
- * to the application to expand prefixes in those contexts
- * itself, if necessary.</p>
- *
- * <p>Note that start/endPrefixMapping events are not
- * guaranteed to be properly nested relative to each-other:
- * all startPrefixMapping events will occur before the
- * corresponding startElement event, and all endPrefixMapping
- * events will occur after the corresponding endElement event,
- * but their order is not guaranteed.</p>
- *
- * @param prefix The Namespace prefix being declared.
- * @param uri The Namespace URI the prefix is mapped to.
+ *
+ * <p>
+ * The information from this event is not necessary for normal Namespace
+ * processing: the SAX XML reader will automatically replace prefixes for
+ * element and attribute names when the http://xml.org/sax/features/namespaces
+ * feature is true (the default).
+ * </p>
+ *
+ * <p>
+ * There are cases, however, when applications need to use prefixes in
+ * character data or in attribute values, where they cannot safely be expanded
+ * automatically; the start/endPrefixMapping event supplies the information to
+ * the application to expand prefixes in those contexts itself, if necessary.
+ * </p>
+ *
+ * <p>
+ * Note that start/endPrefixMapping events are not guaranteed to be properly
+ * nested relative to each-other: all startPrefixMapping events will occur
+ * before the corresponding startElement event, and all endPrefixMapping
+ * events will occur after the corresponding endElement event, but their order
+ * is not guaranteed.
+ * </p>
+ *
+ * @param prefix
+ * The Namespace prefix being declared.
+ * @param uri
+ * The Namespace URI the prefix is mapped to.
* @see #endPrefixMapping
* @see #startElement
*/
public void startPrefixMapping(String prefix, String uri)
- throws org.xml.sax.SAXException
- {
+ throws org.xml.sax.SAXException {
/*
- // Not sure if this is needed or wanted
- // Also, it fails in the stree.
- if((null != m_currentNode)
- && (m_currentNode.getNodeType() == Node.ELEMENT_NODE))
- {
- String qname;
- if(((null != prefix) && (prefix.length() == 0))
- || (null == prefix))
- qname = "xmlns";
- else
- qname = "xmlns:"+prefix;
-
- Element elem = (Element)m_currentNode;
- String val = elem.getAttribute(qname); // Obsolete, should be DOM2...?
- if(val == null)
- {
- elem.setAttributeNS("http://www.w3.org/XML/1998/namespace",
- qname, uri);
- }
- }
- */
+ * // Not sure if this is needed or wanted // Also, it fails in the stree.
+ * if((null != m_currentNode) && (m_currentNode.getNodeType() ==
+ * Node.ELEMENT_NODE)) { String qname; if(((null != prefix) &&
+ * (prefix.length() == 0)) || (null == prefix)) qname = "xmlns"; else qname
+ * = "xmlns:"+prefix;
+ *
+ * Element elem = (Element)m_currentNode; String val =
+ * elem.getAttribute(qname); // Obsolete, should be DOM2...? if(val == null)
+ * { elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", qname,
+ * uri); } }
+ */
}
/**
* End the scope of a prefix-URI mapping.
- *
- * <p>See startPrefixMapping for details. This event will
- * always occur after the corresponding endElement event,
- * but the order of endPrefixMapping events is not otherwise
- * guaranteed.</p>
- *
- * @param prefix The prefix that was being mapping.
+ *
+ * <p>
+ * See startPrefixMapping for details. This event will always occur after the
+ * corresponding endElement event, but the order of endPrefixMapping events is
+ * not otherwise guaranteed.
+ * </p>
+ *
+ * @param prefix
+ * The prefix that was being mapping.
* @see #startPrefixMapping
* @see #endElement
*/
- public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException{}
+ public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
+ }
/**
* Receive notification of a skipped entity.
- *
- * <p>The Parser will invoke this method once for each entity
- * skipped. Non-validating processors may skip entities if they
- * have not seen the declarations (because, for example, the
- * entity was declared in an external DTD subset). All processors
- * may skip external entities, depending on the values of the
- * http://xml.org/sax/features/external-general-entities and the
- * http://xml.org/sax/features/external-parameter-entities
- * properties.</p>
- *
- * @param name The name of the skipped entity. If it is a
- * parameter entity, the name will begin with '%'.
+ *
+ * <p>
+ * The Parser will invoke this method once for each entity skipped.
+ * Non-validating processors may skip entities if they have not seen the
+ * declarations (because, for example, the entity was declared in an external
+ * DTD subset). All processors may skip external entities, depending on the
+ * values of the http://xml.org/sax/features/external-general-entities and the
+ * http://xml.org/sax/features/external-parameter-entities properties.
+ * </p>
+ *
+ * @param name
+ * The name of the skipped entity. If it is a parameter entity, the
+ * name will begin with '%'.
*/
- public void skippedEntity(String name) throws org.xml.sax.SAXException{}
+ public void skippedEntity(String name) throws org.xml.sax.SAXException {
+ }
}
Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Jan 9 06:34:33 2015
@@ -33,34 +33,34 @@ import org.w3c.dom.*;
/**
* A collection of methods for extracting content from DOM trees.
*
- * This class holds a few utility methods for pulling content out of
- * DOM nodes, such as getOutlinks, getText, etc.
- *
+ * This class holds a few utility methods for pulling content out of DOM nodes,
+ * such as getOutlinks, getText, etc.
+ *
*/
public class DOMContentUtils {
public static class LinkParams {
public String elName;
public String attrName;
- public int childLen;
-
- public LinkParams(String elName, String attrName, int childLen) {
- this.elName = elName;
- this.attrName = attrName;
- this.childLen = childLen;
- }
-
- public String toString() {
- return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
- }
+ public int childLen;
+
+ public LinkParams(String elName, String attrName, int childLen) {
+ this.elName = elName;
+ this.attrName = attrName;
+ this.childLen = childLen;
+ }
+
+ public String toString() {
+ return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
+ }
}
-
+
private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>();
-
+
public DOMContentUtils(Configuration conf) {
setConf(conf);
}
-
+
public void setConf(Configuration conf) {
// forceTags is used to override configurable tag ignoring, later on
Collection<String> forceTags = new ArrayList<String>(1);
@@ -81,59 +81,57 @@ public class DOMContentUtils {
// remove unwanted link tags from the linkParams map
String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
- for ( int i = 0 ; ignoreTags != null && i < ignoreTags.length ; i++ ) {
- if ( ! forceTags.contains(ignoreTags[i]) )
+ for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) {
+ if (!forceTags.contains(ignoreTags[i]))
linkParams.remove(ignoreTags[i]);
}
}
-
+
/**
- * This method takes a {@link StringBuilder} and a DOM {@link Node},
- * and will append all the content text found beneath the DOM node to
- * the <code>StringBuilder</code>.
- *
+ * This method takes a {@link StringBuilder} and a DOM {@link Node}, and will
+ * append all the content text found beneath the DOM node to the
+ * <code>StringBuilder</code>.
+ *
* <p>
- *
- * If <code>abortOnNestedAnchors</code> is true, DOM traversal will
- * be aborted and the <code>StringBuffer</code> will not contain
- * any text encountered after a nested anchor is found.
+ *
+ * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
+ * and the <code>StringBuffer</code> will not contain any text encountered
+ * after a nested anchor is found.
*
* <p>
- *
+ *
* @return true if nested anchors were found
*/
- public boolean getText(StringBuilder sb, Node node,
- boolean abortOnNestedAnchors) {
+ public boolean getText(StringBuilder sb, Node node,
+ boolean abortOnNestedAnchors) {
if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
return true;
- }
+ }
return false;
}
-
/**
- * This is a convinience method, equivalent to {@link
- * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
+ * This is a convinience method, equivalent to
+ * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
*
*/
public void getText(StringBuilder sb, Node node) {
getText(sb, node, false);
}
- // returns true if abortOnNestedAnchors is true and we find nested
+ // returns true if abortOnNestedAnchors is true and we find nested
// anchors
- private boolean getTextHelper(StringBuilder sb, Node node,
- boolean abortOnNestedAnchors,
- int anchorDepth) {
+ private boolean getTextHelper(StringBuilder sb, Node node,
+ boolean abortOnNestedAnchors, int anchorDepth) {
boolean abort = false;
NodeWalker walker = new NodeWalker(node);
-
+
while (walker.hasNext()) {
-
+
Node currentNode = walker.nextNode();
String nodeName = currentNode.getNodeName();
short nodeType = currentNode.getNodeType();
-
+
if ("script".equalsIgnoreCase(nodeName)) {
walker.skipChildren();
}
@@ -145,7 +143,7 @@ public class DOMContentUtils {
if (anchorDepth > 1) {
abort = true;
break;
- }
+ }
}
if (nodeType == Node.COMMENT_NODE) {
walker.skipChildren();
@@ -156,44 +154,45 @@ public class DOMContentUtils {
text = text.replaceAll("\\s+", " ");
text = text.trim();
if (text.length() > 0) {
- if (sb.length() > 0) sb.append(' ');
- sb.append(text);
+ if (sb.length() > 0)
+ sb.append(' ');
+ sb.append(text);
}
}
}
-
+
return abort;
}
/**
- * This method takes a {@link StringBuffer} and a DOM {@link Node},
- * and will append the content text found beneath the first
- * <code>title</code> node to the <code>StringBuffer</code>.
- *
+ * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
+ * append the content text found beneath the first <code>title</code> node to
+ * the <code>StringBuffer</code>.
+ *
* @return true if a title node was found, false otherwise
*/
public boolean getTitle(StringBuilder sb, Node node) {
-
+
NodeWalker walker = new NodeWalker(node);
-
+
while (walker.hasNext()) {
-
+
Node currentNode = walker.nextNode();
String nodeName = currentNode.getNodeName();
short nodeType = currentNode.getNodeType();
-
+
if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
return false;
}
-
+
if (nodeType == Node.ELEMENT_NODE) {
if ("title".equalsIgnoreCase(nodeName)) {
getText(sb, currentNode);
return true;
}
}
- }
-
+ }
+
return false;
}
@@ -201,28 +200,29 @@ public class DOMContentUtils {
public URL getBase(Node node) {
NodeWalker walker = new NodeWalker(node);
-
+
while (walker.hasNext()) {
-
+
Node currentNode = walker.nextNode();
String nodeName = currentNode.getNodeName();
short nodeType = currentNode.getNodeType();
-
+
// is this node a BASE tag?
if (nodeType == Node.ELEMENT_NODE) {
-
+
if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
return null;
}
-
+
if ("base".equalsIgnoreCase(nodeName)) {
NamedNodeMap attrs = currentNode.getAttributes();
- for (int i= 0; i < attrs.getLength(); i++ ) {
+ for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
if ("href".equalsIgnoreCase(attr.getNodeName())) {
try {
return new URL(attr.getNodeValue());
- } catch (MalformedURLException e) {}
+ } catch (MalformedURLException e) {
+ }
}
}
}
@@ -233,10 +233,9 @@ public class DOMContentUtils {
return null;
}
-
private boolean hasOnlyWhiteSpace(Node node) {
- String val= node.getNodeValue();
- for (int i= 0; i < val.length(); i++) {
+ String val = node.getNodeValue();
+ for (int i = 0; i < val.length(); i++) {
if (!Character.isWhitespace(val.charAt(i)))
return false;
}
@@ -245,50 +244,49 @@ public class DOMContentUtils {
// this only covers a few cases of empty links that are symptomatic
// of nekohtml's DOM-fixup process...
- private boolean shouldThrowAwayLink(Node node, NodeList children,
- int childLen, LinkParams params) {
+ private boolean shouldThrowAwayLink(Node node, NodeList children,
+ int childLen, LinkParams params) {
if (childLen == 0) {
- // this has no inner structure
- if (params.childLen == 0) return false;
- else return true;
- } else if ((childLen == 1)
- && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
- && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
+ // this has no inner structure
+ if (params.childLen == 0)
+ return false;
+ else
+ return true;
+ } else if ((childLen == 1)
+ && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
+ && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
// single nested link
return true;
} else if (childLen == 2) {
- Node c0= children.item(0);
- Node c1= children.item(1);
+ Node c0 = children.item(0);
+ Node c1 = children.item(1);
if ((c0.getNodeType() == Node.ELEMENT_NODE)
&& (params.elName.equalsIgnoreCase(c0.getNodeName()))
- && (c1.getNodeType() == Node.TEXT_NODE)
- && hasOnlyWhiteSpace(c1) ) {
+ && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) {
// single link followed by whitespace node
return true;
}
if ((c1.getNodeType() == Node.ELEMENT_NODE)
&& (params.elName.equalsIgnoreCase(c1.getNodeName()))
- && (c0.getNodeType() == Node.TEXT_NODE)
- && hasOnlyWhiteSpace(c0) ) {
+ && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) {
// whitespace node followed by single link
return true;
}
} else if (childLen == 3) {
- Node c0= children.item(0);
- Node c1= children.item(1);
- Node c2= children.item(2);
-
+ Node c0 = children.item(0);
+ Node c1 = children.item(1);
+ Node c2 = children.item(2);
+
if ((c1.getNodeType() == Node.ELEMENT_NODE)
&& (params.elName.equalsIgnoreCase(c1.getNodeName()))
- && (c0.getNodeType() == Node.TEXT_NODE)
- && (c2.getNodeType() == Node.TEXT_NODE)
- && hasOnlyWhiteSpace(c0)
- && hasOnlyWhiteSpace(c2) ) {
+ && (c0.getNodeType() == Node.TEXT_NODE)
+ && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)
+ && hasOnlyWhiteSpace(c2)) {
// single link surrounded by whitespace nodes
return true;
}
@@ -296,57 +294,54 @@ public class DOMContentUtils {
return false;
}
-
+
/**
- * This method finds all anchors below the supplied DOM
- * <code>node</code>, and creates appropriate {@link Outlink}
- * records for each (relative to the supplied <code>base</code>
- * URL), and adds them to the <code>outlinks</code> {@link
- * ArrayList}.
- *
+ * This method finds all anchors below the supplied DOM <code>node</code>, and
+ * creates appropriate {@link Outlink} records for each (relative to the
+ * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
+ * {@link ArrayList}.
+ *
* <p>
- *
- * Links without inner structure (tags, text, etc) are discarded, as
- * are links which contain only single nested links and empty text
- * nodes (this is a common DOM-fixup artifact, at least with
- * nekohtml).
+ *
+ * Links without inner structure (tags, text, etc) are discarded, as are links
+ * which contain only single nested links and empty text nodes (this is a
+ * common DOM-fixup artifact, at least with nekohtml).
*/
- public void getOutlinks(URL base, ArrayList<Outlink> outlinks,
- Node node) {
-
+ public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
+
NodeWalker walker = new NodeWalker(node);
while (walker.hasNext()) {
-
+
Node currentNode = walker.nextNode();
String nodeName = currentNode.getNodeName();
- short nodeType = currentNode.getNodeType();
+ short nodeType = currentNode.getNodeType();
NodeList children = currentNode.getChildNodes();
- int childLen = (children != null) ? children.getLength() : 0;
-
+ int childLen = (children != null) ? children.getLength() : 0;
+
if (nodeType == Node.ELEMENT_NODE) {
-
+
nodeName = nodeName.toLowerCase();
LinkParams params = linkParams.get(nodeName);
if (params != null) {
if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
-
+
StringBuilder linkText = new StringBuilder();
getText(linkText, currentNode, true);
-
+
NamedNodeMap attrs = currentNode.getAttributes();
String target = null;
boolean noFollow = false;
boolean post = false;
- for (int i= 0; i < attrs.getLength(); i++ ) {
+ for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
String attrName = attr.getNodeName();
if (params.attrName.equalsIgnoreCase(attrName)) {
target = attr.getNodeValue();
- } else if ("rel".equalsIgnoreCase(attrName) &&
- "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+ } else if ("rel".equalsIgnoreCase(attrName)
+ && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
noFollow = true;
- } else if ("method".equalsIgnoreCase(attrName) &&
- "post".equalsIgnoreCase(attr.getNodeValue())) {
+ } else if ("method".equalsIgnoreCase(attrName)
+ && "post".equalsIgnoreCase(attr.getNodeValue())) {
post = true;
}
}
@@ -354,18 +349,18 @@ public class DOMContentUtils {
try {
URL url = URLUtil.resolveURL(base, target);
- outlinks.add(new Outlink(url.toString(),
- linkText.toString().trim()));
+ outlinks.add(new Outlink(url.toString(), linkText.toString()
+ .trim()));
} catch (MalformedURLException e) {
// don't care
}
}
// this should not have any children, skip them
- if (params.childLen == 0) continue;
+ if (params.childLen == 0)
+ continue;
}
}
}
}
}
-
Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java Fri Jan 9 06:34:33 2015
@@ -23,32 +23,31 @@ import org.apache.nutch.parse.HTMLMetaTa
import org.w3c.dom.*;
/**
- * Class for parsing META Directives from DOM trees. This class
- * handles specifically Robots META directives (all, none, nofollow,
- * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
- * instructions. All meta directives are stored in a HTMLMetaTags instance.
+ * Class for parsing META Directives from DOM trees. This class handles
+ * specifically Robots META directives (all, none, nofollow, noindex), finding
+ * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are
+ * stored in a HTMLMetaTags instance.
*/
public class HTMLMetaProcessor {
/**
- * Utility class with indicators for the robots directives "noindex"
- * and "nofollow", and HTTP-EQUIV/no-cache
+ * Utility class with indicators for the robots directives "noindex" and
+ * "nofollow", and HTTP-EQUIV/no-cache
*/
-
+
/**
- * Sets the indicators in <code>robotsMeta</code> to appropriate
- * values, based on any META tags found under the given
- * <code>node</code>.
+ * Sets the indicators in <code>robotsMeta</code> to appropriate values, based
+ * on any META tags found under the given <code>node</code>.
*/
- public static final void getMetaTags (
- HTMLMetaTags metaTags, Node node, URL currURL) {
+ public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
+ URL currURL) {
metaTags.reset();
getMetaTagsHelper(metaTags, node, currURL);
}
- private static final void getMetaTagsHelper(
- HTMLMetaTags metaTags, Node node, URL currURL) {
+ private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
+ URL currURL) {
if (node.getNodeType() == Node.ELEMENT_NODE) {
@@ -63,7 +62,7 @@ public class HTMLMetaProcessor {
Node equivNode = null;
Node contentNode = null;
// Retrieves name, http-equiv and content attribues
- for (int i=0; i<attrs.getLength(); i++) {
+ for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
String attrName = attr.getNodeName().toLowerCase();
if (attrName.equals("name")) {
@@ -74,44 +73,43 @@ public class HTMLMetaProcessor {
contentNode = attr;
}
}
-
+
if (nameNode != null) {
if (contentNode != null) {
String name = nameNode.getNodeValue().toLowerCase();
metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
if ("robots".equals(name)) {
-
+
if (contentNode != null) {
- String directives =
- contentNode.getNodeValue().toLowerCase();
+ String directives = contentNode.getNodeValue().toLowerCase();
int index = directives.indexOf("none");
-
+
if (index >= 0) {
metaTags.setNoIndex();
metaTags.setNoFollow();
}
-
+
index = directives.indexOf("all");
if (index >= 0) {
// do nothing...
}
-
+
index = directives.indexOf("noindex");
if (index >= 0) {
metaTags.setNoIndex();
}
-
+
index = directives.indexOf("nofollow");
if (index >= 0) {
metaTags.setNoFollow();
}
-
+
index = directives.indexOf("noarchive");
if (index >= 0) {
metaTags.setNoCache();
}
- }
-
+ }
+
} // end if (name == robots)
}
}
@@ -124,14 +122,15 @@ public class HTMLMetaProcessor {
if ("pragma".equals(name)) {
content = content.toLowerCase();
int index = content.indexOf("no-cache");
- if (index >= 0)
+ if (index >= 0)
metaTags.setNoCache();
} else if ("refresh".equals(name)) {
int idx = content.indexOf(';');
String time = null;
if (idx == -1) { // just the refresh time
time = content;
- } else time = content.substring(0, idx);
+ } else
+ time = content.substring(0, idx);
try {
metaTags.setRefreshTime(Integer.parseInt(time));
// skip this if we couldn't parse the time
@@ -142,9 +141,11 @@ public class HTMLMetaProcessor {
URL refreshUrl = null;
if (metaTags.getRefresh() && idx != -1) { // set the URL
idx = content.toLowerCase().indexOf("url=");
- if (idx == -1) { // assume a mis-formatted entry with just the url
+ if (idx == -1) { // assume a mis-formatted entry with just the
+ // url
idx = content.indexOf(';') + 1;
- } else idx += 4;
+ } else
+ idx += 4;
if (idx != -1) {
String url = content.substring(idx);
try {
@@ -187,13 +188,13 @@ public class HTMLMetaProcessor {
try {
if (currURL == null)
url = new URL(urlString);
- else
+ else
url = new URL(currURL, urlString);
} catch (Exception e) {
;
}
- if (url != null)
+ if (url != null)
metaTags.setBaseHref(url);
}
Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Fri Jan 9 06:34:33 2015
@@ -61,23 +61,23 @@ import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
public class HtmlParser implements Parser {
- public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.html");
+ public static final Logger LOG = LoggerFactory
+ .getLogger("org.apache.nutch.parse.html");
- // I used 1000 bytes at first, but found that some documents have
+ // I used 1000 bytes at first, but found that some documents have
// meta tag well past the first 1000 bytes.
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
private static final int CHUNK_SIZE = 2000;
// NUTCH-1006 Meta equiv with single quotes not accepted
- private static Pattern metaPattern =
- Pattern.compile("<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
- Pattern.CASE_INSENSITIVE);
- private static Pattern charsetPattern =
- Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
- Pattern.CASE_INSENSITIVE);
- private static Pattern charsetPatternHTML5 =
- Pattern.compile("<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
- Pattern.CASE_INSENSITIVE);
+ private static Pattern metaPattern = Pattern.compile(
+ "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
+ Pattern.CASE_INSENSITIVE);
+ private static Pattern charsetPattern = Pattern.compile(
+ "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
+ private static Pattern charsetPatternHTML5 = Pattern.compile(
+ "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
+ Pattern.CASE_INSENSITIVE);
private static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
@@ -89,19 +89,19 @@ public class HtmlParser implements Parse
/**
* Given a <code>ByteBuffer</code> representing an html file of an
- * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
- * from the first <code>CHUNK_SIZE</code> bytes.
- * If there's no meta tag for Content-Type or no charset is specified,
- * the content is checked for a Unicode Byte Order Mark (BOM).
- * This will also cover non-byte oriented character encodings (UTF-16 only).
- * If no character set can be determined,
- * <code>null</code> is returned. <br />
- * See also http://www.w3.org/International/questions/qa-html-encoding-declarations,
+ * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
+ * from the first <code>CHUNK_SIZE</code> bytes. If there's no meta tag for
+ * Content-Type or no charset is specified, the content is checked for a
+ * Unicode Byte Order Mark (BOM). This will also cover non-byte oriented
+ * character encodings (UTF-16 only). If no character set can be determined,
+ * <code>null</code> is returned. <br />
+ * See also
+ * http://www.w3.org/International/questions/qa-html-encoding-declarations,
* http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and
- * http://www.w3.org/TR/REC-xml/#sec-guessing
- * <br />
- *
- * @param content <code>ByteBuffer</code> representation of an html file
+ * http://www.w3.org/TR/REC-xml/#sec-guessing <br />
+ *
+ * @param content
+ * <code>ByteBuffer</code> representation of an html file
*/
private static String sniffCharacterEncoding(ByteBuffer content) {
@@ -113,8 +113,8 @@ public class HtmlParser implements Parse
// {U+0041, U+0082, U+00B7}.
String str = "";
try {
- str = new String(content.array(), content.arrayOffset() + content.position(),
- length, Charset.forName("ASCII").toString());
+ str = new String(content.array(), content.arrayOffset()
+ + content.position(), length, Charset.forName("ASCII").toString());
} catch (UnsupportedEncodingException e) {
// code should never come here, but just in case...
return null;
@@ -136,17 +136,14 @@ public class HtmlParser implements Parse
}
if (encoding == null) {
// check for BOM
- if (length >= 3
- && content.get(0) == (byte) 0xEF
- && content.get(1) == (byte) 0xBB
- && content.get(2) == (byte) 0xBF) {
+ if (length >= 3 && content.get(0) == (byte) 0xEF
+ && content.get(1) == (byte) 0xBB && content.get(2) == (byte) 0xBF) {
encoding = "UTF-8";
} else if (length >= 2) {
- if (content.get(0) == (byte)0xFF
- && content.get(1) == (byte)0xFE) {
+ if (content.get(0) == (byte) 0xFF && content.get(1) == (byte) 0xFE) {
encoding = "UTF-16LE";
- } else if (content.get(0) == (byte)0xFE
- && content.get(1) == (byte)0xFF) {
+ } else if (content.get(0) == (byte) 0xFE
+ && content.get(1) == (byte) 0xFF) {
encoding = "UTF-16BE";
}
}
@@ -184,19 +181,24 @@ public class HtmlParser implements Parse
DocumentFragment root;
try {
ByteBuffer contentInOctets = page.getContent();
- InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets.array(),
- contentInOctets.arrayOffset() + contentInOctets.position(), contentInOctets.remaining()));
+ InputSource input = new InputSource(new ByteArrayInputStream(
+ contentInOctets.array(), contentInOctets.arrayOffset()
+ + contentInOctets.position(), contentInOctets.remaining()));
EncodingDetector detector = new EncodingDetector(conf);
detector.autoDetectClues(page, true);
detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
String encoding = detector.guessEncoding(page, defaultCharEncoding);
- page.getMetadata().put(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING), ByteBuffer.wrap(Bytes.toBytes(encoding)));
- page.getMetadata().put(new Utf8(Metadata.CHAR_ENCODING_FOR_CONVERSION), ByteBuffer.wrap(Bytes.toBytes(encoding)));
+ page.getMetadata().put(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING),
+ ByteBuffer.wrap(Bytes.toBytes(encoding)));
+ page.getMetadata().put(new Utf8(Metadata.CHAR_ENCODING_FOR_CONVERSION),
+ ByteBuffer.wrap(Bytes.toBytes(encoding)));
input.setEncoding(encoding);
- if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); }
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Parsing...");
+ }
root = parse(input);
} catch (IOException e) {
LOG.error("Failed with the following IOException: ", e);
@@ -218,40 +220,47 @@ public class HtmlParser implements Parse
LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
}
// check meta directives
- if (!metaTags.getNoIndex()) { // okay to index
+ if (!metaTags.getNoIndex()) { // okay to index
StringBuilder sb = new StringBuilder();
- if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); }
- utils.getText(sb, root); // extract text
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Getting text...");
+ }
+ utils.getText(sb, root); // extract text
text = sb.toString();
sb.setLength(0);
- if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); }
- utils.getTitle(sb, root); // extract title
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Getting title...");
+ }
+ utils.getTitle(sb, root); // extract title
title = sb.toString().trim();
}
- if (!metaTags.getNoFollow()) { // okay to follow links
- ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
+ if (!metaTags.getNoFollow()) { // okay to follow links
+ ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
URL baseTag = utils.getBase(root);
- if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); }
- utils.getOutlinks(baseTag!=null?baseTag:base, l, root);
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Getting links...");
+ }
+ utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
outlinks = l.toArray(new Outlink[l.size()]);
if (LOG.isTraceEnabled()) {
- LOG.trace("found "+outlinks.length+" outlinks in "+ url);
+ LOG.trace("found " + outlinks.length + " outlinks in " + url);
}
}
ParseStatus status = ParseStatus.newBuilder().build();
- status.setMajorCode((int)ParseStatusCodes.SUCCESS);
+ status.setMajorCode((int) ParseStatusCodes.SUCCESS);
if (metaTags.getRefresh()) {
- status.setMinorCode((int)ParseStatusCodes.SUCCESS_REDIRECT);
+ status.setMinorCode((int) ParseStatusCodes.SUCCESS_REDIRECT);
status.getArgs().add(new Utf8(metaTags.getRefreshHref().toString()));
- status.getArgs().add(new Utf8(Integer.toString(metaTags.getRefreshTime())));
+ status.getArgs().add(
+ new Utf8(Integer.toString(metaTags.getRefreshTime())));
}
Parse parse = new Parse(text, title, outlinks, status);
parse = htmlParseFilters.filter(url, page, parse, metaTags, root);
- if (metaTags.getNoCache()) { // not okay to cache
+ if (metaTags.getNoCache()) { // not okay to cache
page.getMetadata().put(new Utf8(Nutch.CACHING_FORBIDDEN_KEY),
ByteBuffer.wrap(Bytes.toBytes(cachingPolicy)));
}
@@ -262,7 +271,8 @@ public class HtmlParser implements Parse
private DocumentFragment parse(InputSource input) throws Exception {
if (parserImpl.equalsIgnoreCase("tagsoup"))
return parseTagSoup(input);
- else return parseNeko(input);
+ else
+ return parseNeko(input);
}
private DocumentFragment parseTagSoup(InputSource input) throws Exception {
@@ -273,7 +283,8 @@ public class HtmlParser implements Parse
reader.setContentHandler(builder);
reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false);
- reader.setProperty("http://xml.org/sax/properties/lexical-handler", builder);
+ reader
+ .setProperty("http://xml.org/sax/properties/lexical-handler", builder);
reader.parse(input);
return frag;
}
@@ -281,21 +292,30 @@ public class HtmlParser implements Parse
private DocumentFragment parseNeko(InputSource input) throws Exception {
DOMFragmentParser parser = new DOMFragmentParser();
try {
- parser.setFeature("http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+ parser
+ .setFeature(
+ "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
true);
parser.setFeature("http://cyberneko.org/html/features/augmentations",
true);
- parser.setProperty("http://cyberneko.org/html/properties/default-encoding",
+ parser.setProperty(
+ "http://cyberneko.org/html/properties/default-encoding",
defaultCharEncoding);
- parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset",
- true);
- parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
- false);
- parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
+ parser
+ .setFeature(
+ "http://cyberneko.org/html/features/scanner/ignore-specified-charset",
+ true);
+ parser
+ .setFeature(
+ "http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
+ false);
+ parser.setFeature(
+ "http://cyberneko.org/html/features/balance-tags/document-fragment",
true);
parser.setFeature("http://cyberneko.org/html/features/report-errors",
LOG.isTraceEnabled());
- } catch (SAXException e) {}
+ } catch (SAXException e) {
+ }
// convert Document to DocumentFragment
HTMLDocumentImpl doc = new HTMLDocumentImpl();
doc.setErrorChecking(false);
@@ -305,18 +325,21 @@ public class HtmlParser implements Parse
res.appendChild(frag);
try {
- while(true) {
+ while (true) {
frag = doc.createDocumentFragment();
parser.parse(input, frag);
- if (!frag.hasChildNodes()) break;
+ if (!frag.hasChildNodes())
+ break;
if (LOG.isInfoEnabled()) {
- LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes.");
+ LOG.info(" - new frag, " + frag.getChildNodes().getLength()
+ + " nodes.");
}
res.appendChild(frag);
}
- } catch (Exception x) {
+ } catch (Exception x) {
LOG.error("Failed with the following Exception: ", x);
- };
+ }
+ ;
return res;
}
@@ -341,11 +364,11 @@ public class HtmlParser implements Parse
}
public static void main(String[] args) throws Exception {
- //LOG.setLevel(Level.FINE);
+ // LOG.setLevel(Level.FINE);
String name = args[0];
- String url = "file:"+name;
+ String url = "file:" + name;
File file = new File(name);
- byte[] bytes = new byte[(int)file.length()];
+ byte[] bytes = new byte[(int) file.length()];
DataInputStream in = new DataInputStream(new FileInputStream(file));
in.readFully(bytes);
Configuration conf = NutchConfiguration.create();
@@ -356,8 +379,8 @@ public class HtmlParser implements Parse
page.setContent(ByteBuffer.wrap(bytes));
page.setContentType(new Utf8("text/html"));
Parse parse = parser.getParse(url, page);
- System.out.println("title: "+parse.getTitle());
- System.out.println("text: "+parse.getText());
+ System.out.println("title: " + parse.getTitle());
+ System.out.println("text: " + parse.getText());
System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks()));
}
Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java Fri Jan 9 06:34:33 2015
@@ -26,40 +26,42 @@
package org.apache.nutch.parse.html;
/**
- * Class used to verify whether the specified <var>ch</var>
- * conforms to the XML 1.0 definition of whitespace.
+ * Class used to verify whether the specified <var>ch</var> conforms to the XML
+ * 1.0 definition of whitespace.
*/
-public class XMLCharacterRecognizer
-{
+public class XMLCharacterRecognizer {
/**
- * Returns whether the specified <var>ch</var> conforms to the XML 1.0 definition
- * of whitespace. Refer to <A href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S">
- * the definition of <CODE>S</CODE></A> for details.
- * @param ch Character to check as XML whitespace.
+ * Returns whether the specified <var>ch</var> conforms to the XML 1.0
+ * definition of whitespace. Refer to <A
+ * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of
+ * <CODE>S</CODE></A> for details.
+ *
+ * @param ch
+ * Character to check as XML whitespace.
* @return =true if <var>ch</var> is XML whitespace; otherwise =false.
*/
- public static boolean isWhiteSpace(char ch)
- {
+ public static boolean isWhiteSpace(char ch) {
return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA);
}
/**
* Tell if the string is whitespace.
- *
- * @param ch Character array to check as XML whitespace.
- * @param start Start index of characters in the array
- * @param length Number of characters in the array
- * @return True if the characters in the array are
- * XML whitespace; otherwise, false.
+ *
+ * @param ch
+ * Character array to check as XML whitespace.
+ * @param start
+ * Start index of characters in the array
+ * @param length
+ * Number of characters in the array
+ * @return True if the characters in the array are XML whitespace; otherwise,
+ * false.
*/
- public static boolean isWhiteSpace(char ch[], int start, int length)
- {
+ public static boolean isWhiteSpace(char ch[], int start, int length) {
int end = start + length;
- for (int s = start; s < end; s++)
- {
+ for (int s = start; s < end; s++) {
if (!isWhiteSpace(ch[s]))
return false;
}
@@ -69,39 +71,36 @@ public class XMLCharacterRecognizer
/**
* Tell if the string is whitespace.
- *
- * @param buf StringBuffer to check as XML whitespace.
+ *
+ * @param buf
+ * StringBuffer to check as XML whitespace.
* @return True if characters in buffer are XML whitespace, false otherwise
*/
- public static boolean isWhiteSpace(StringBuffer buf)
- {
+ public static boolean isWhiteSpace(StringBuffer buf) {
int n = buf.length();
- for (int i = 0; i < n; i++)
- {
+ for (int i = 0; i < n; i++) {
if (!isWhiteSpace(buf.charAt(i)))
return false;
}
return true;
}
-
+
/**
* Tell if the string is whitespace.
- *
- * @param s String to check as XML whitespace.
+ *
+ * @param s
+ * String to check as XML whitespace.
* @return True if characters in buffer are XML whitespace, false otherwise
*/
- public static boolean isWhiteSpace(String s)
- {
+ public static boolean isWhiteSpace(String s) {
- if(null != s)
- {
+ if (null != s) {
int n = s.length();
-
- for (int i = 0; i < n; i++)
- {
+
+ for (int i = 0; i < n; i++) {
if (!isWhiteSpace(s.charAt(i)))
return false;
}