You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2010/06/30 12:36:29 UTC

svn commit: r959259 [10/12] - in /nutch/branches/nutchbase: ./ bin/ conf/ contrib/ docs/ ivy/ lib/ lib/jetty-ext/ src/engines/ src/gora/ src/java/ src/java/org/apache/nutch/analysis/ src/java/org/apache/nutch/clustering/ src/java/org/apache/nutch/crawl...

Added: nutch/branches/nutchbase/src/plugin/parse-tika/sample/pdftest.pdf
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/sample/pdftest.pdf?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/sample/pdftest.pdf (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/sample/pdftest.pdf Wed Jun 30 10:36:20 2010
@@ -0,0 +1,157 @@
+%PDF-1.2 
+%����
+ 
+9 0 obj
+<<
+/Length 10 0 R
+/Filter /FlateDecode 
+>>
+stream
+H�͐�J�0�� ��{��f�$M��n�-���[&je���ۤ�~�$���}�Ʌ�Ij���s����~�X�-],��$Y���)�'N�u�1!���V�?��?
+�b1Rbb�҉�H�[��TD:#�&ح��X���i�$qnf�����]������a��{��أ���q|J�Ls]�Q�I��j�%��9��`�঺��U�ite�z�$����OeB�Ēү�R��@zܗ���g���<���
+endstream
+endobj
+10 0 obj
+246
+endobj
+4 0 obj
+<<
+/Type /Page
+/Parent 5 0 R
+/Resources <<
+/Font <<
+/F0 6 0 R 
+/F1 7 0 R 
+>>
+/ProcSet 2 0 R
+>>
+/Contents 9 0 R
+>>
+endobj
+6 0 obj
+<<
+/Type /Font
+/Subtype /TrueType
+/Name /F0
+/BaseFont /Arial
+/Encoding /WinAnsiEncoding
+>>
+endobj
+7 0 obj
+<<
+/Type /Font
+/Subtype /TrueType
+/Name /F1
+/BaseFont /BookAntiqua,Bold
+/FirstChar 31
+/LastChar 255
+/Widths [ 750 250 278 402 606 500 889 833 227 333 333 444 606 250 333 250 
+296 500 500 500 500 500 500 500 500 500 500 250 250 606 606 606 
+444 747 778 667 722 833 611 556 833 833 389 389 778 611 1000 833 
+833 611 833 722 611 667 778 778 1000 667 667 667 333 606 333 606 
+500 333 500 611 444 611 500 389 556 611 333 333 611 333 889 611 
+556 611 611 389 444 333 611 556 833 500 556 500 310 606 310 606 
+750 500 750 333 500 500 1000 500 500 333 1000 611 389 1000 750 750 
+750 750 278 278 500 500 606 500 1000 333 998 444 389 833 750 750 
+667 250 278 500 500 606 500 606 500 333 747 438 500 606 333 747 
+500 400 549 361 361 333 576 641 250 333 361 488 500 889 890 889 
+444 778 778 778 778 778 778 1000 722 611 611 611 611 389 389 389 
+389 833 833 833 833 833 833 833 606 833 778 778 778 778 667 611 
+611 500 500 500 500 500 500 778 444 500 500 500 500 333 333 333 
+333 556 611 556 556 556 556 556 549 556 611 611 611 611 556 611 
+556 ]
+/Encoding /WinAnsiEncoding
+/FontDescriptor 8 0 R
+>>
+endobj
+8 0 obj
+<<
+/Type /FontDescriptor
+/FontName /BookAntiqua,Bold
+/Flags 16418
+/FontBBox [ -250 -260 1236 930 ]
+/MissingWidth 750
+/StemV 146
+/StemH 146
+/ItalicAngle 0
+/CapHeight 930
+/XHeight 651
+/Ascent 930
+/Descent 260
+/Leading 210
+/MaxWidth 1030
+/AvgWidth 460
+>>
+endobj
+2 0 obj
+[ /PDF /Text  ]
+endobj
+5 0 obj
+<<
+/Kids [4 0 R ]
+/Count 1
+/Type /Pages
+/MediaBox [ 0 0 612 792 ]
+>>
+endobj
+1 0 obj
+<<
+/Creator (1725.fm)
+/CreationDate (1-Jan-3 18:15PM)
+/Title (1725.PDF)
+/Author (Unknown)
+/Producer (Acrobat PDFWriter 3.02 for Windows)
+/Keywords ()
+/Subject ()
+>>
+endobj
+3 0 obj
+<<
+/Pages 5 0 R
+/Type /Catalog
+/DefaultGray 11 0 R
+/DefaultRGB  12 0 R
+>>
+endobj
+11 0 obj
+[/CalGray
+<<
+/WhitePoint [0.9505 1 1.0891 ]
+/Gamma 0.2468 
+>>
+]
+endobj
+12 0 obj
+[/CalRGB
+<<
+/WhitePoint [0.9505 1 1.0891 ]
+/Gamma [0.2468 0.2468 0.2468 ]
+/Matrix [0.4361 0.2225 0.0139 0.3851 0.7169 0.0971 0.1431 0.0606 0.7141 ]
+>>
+]
+endobj
+xref
+0 13
+0000000000 65535 f
+0000002172 00000 n
+0000002046 00000 n
+0000002363 00000 n
+0000000375 00000 n
+0000002080 00000 n
+0000000518 00000 n
+0000000633 00000 n
+0000001760 00000 n
+0000000021 00000 n
+0000000352 00000 n
+0000002460 00000 n
+0000002548 00000 n
+trailer
+<<
+/Size 13
+/Root 3 0 R
+/Info 1 0 R
+/ID [<47149510433dd4882f05f8c124223734><47149510433dd4882f05f8c124223734>]
+>>
+startxref
+2726
+%%EOF

Added: nutch/branches/nutchbase/src/plugin/parse-tika/sample/test.rtf
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/sample/test.rtf?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/sample/test.rtf (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/sample/test.rtf Wed Jun 30 10:36:20 2010
@@ -0,0 +1,17 @@
+{\rtf1\ansi\deff1\adeflang1025
+{\fonttbl{\f0\froman\fprq2\fcharset0 Times;}{\f1\froman\fprq2\fcharset0 Times New Roman;}{\f2\fmodern\fprq1\fcharset0 Courier New;}{\f3\froman\fprq2\fcharset0 Times New Roman;}{\f4\fnil\fprq2\fcharset0 Interface User;}{\f5\fnil\fprq2\fcharset0 Lucidasans;}{\f6\fnil\fprq0\fcharset0 Lucidasans;}}
+{\colortbl;\red0\green0\blue0;\red0\green0\blue128;\red128\green128\blue128;}
+{\stylesheet{\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af5\afs24\lang255\ltrch\dbch\af4\afs24\langfe255\loch\f1\fs24\lang1033\snext1 Default;}
+{\s2\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon1\snext2 Text body;}
+{\s3\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\af1\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon2\snext3 List;}
+{\s4\sb120\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs20\lang255\ai\ltrch\dbch\afs20\langfe255\ai\loch\f1\fs20\lang1033\i\sbasedon1\snext4 Caption;}
+{\s5\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon1\snext5 Index;}
+{\*\cs7\cf0\rtlch\af2\afs24\lang255\ltrch\dbch\af2\afs24\langfe255\loch\f2\fs24\lang1033 Teletype;}
+{\*\cs8\cf2\ul\rtlch\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\fs24\lang1033 Internet Link;}
+}
+{\info{\title test rft document}{\subject tests}{\creatim\yr2004\mo9\dy20\hr19\min36}{\revtim\yr1601\mo1\dy1\hr0\min0}{\printim\yr1601\mo1\dy1\hr0\min0}{\comment StarWriter}{\vern6450}}\deftab709
+{\*\pgdsctbl
+{\pgdsc0\pgdscuse195\pgwsxn11905\pghsxn16837\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\pgdscnxt0 Default;}}
+{\*\pgdscno0}\paperh16837\paperw11905\margl1800\margr1800\margt1440\margb1440\sectd\sbknone\pgwsxn11905\pghsxn16837\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
+\pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\ql\rtlch\af5\afs24\lang255\ltrch\dbch\af4\afs24\langfe255\loch\f1\fs24\lang1033{\loch\f2\fs24\lang1033\i0\b0\*\cs7\cf0\rtlch\ltrch\dbch\loch\f2\fs24\lang1033 The quick brown fox jumps over the lazy dog}
+\par }
\ No newline at end of file

Added: nutch/branches/nutchbase/src/plugin/parse-tika/sample/word97.doc
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/sample/word97.doc?rev=959259&view=auto
==============================================================================
Binary file - no diff available.

Propchange: nutch/branches/nutchbase/src/plugin/parse-tika/sample/word97.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java Wed Jun 30 10:36:20 2010
@@ -0,0 +1,740 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
+ * avoid dependency on Xalan.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id: DOMBuilder.java 823614 2009-10-09 17:02:32Z ab $
+ */
+package org.apache.nutch.parse.tika;
+
+import java.util.Stack;
+
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.Text;
+import org.w3c.dom.CDATASection;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.ext.LexicalHandler;
+/**
+ * This class takes SAX events (in addition to some extra events
+ * that SAX doesn't handle yet) and adds the result to a document
+ * or document fragment.
+ */
+class DOMBuilder
+        implements ContentHandler, LexicalHandler
+{
+
+  /** Root document          */
+  public Document m_doc;
+
+  /** Current node           */
+  protected Node m_currentNode = null;
+
+  /** First node of document fragment or null if not a DocumentFragment     */
+  public DocumentFragment m_docFrag = null;
+
+  /** Stack of the element nodes that are currently open          */
+  protected Stack m_elemStack = new Stack();
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes
+   * as children of the given node.
+   *
+   * @param doc Root document
+   * @param node Current node
+   */
+  DOMBuilder(Document doc, Node node)
+  {
+    m_doc = doc;
+    m_currentNode = node;
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes
+   * to the document fragment.
+   *
+   * @param doc Root document
+   * @param docFrag Document fragment
+   */
+  DOMBuilder(Document doc, DocumentFragment docFrag)
+  {
+    m_doc = doc;
+    m_docFrag = docFrag;
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes
+   * to the document.
+   *
+   * @param doc Root document
+   */
+  DOMBuilder(Document doc)
+  {
+    m_doc = doc;
+  }
+
+  /**
+   * Get the root node of the DOM being created.  This
+   * is either a Document or a DocumentFragment.
+   *
+   * @return The root document or document fragment if not null
+   */
+  Node getRootNode()
+  {
+    return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc;
+  }
+
+  /**
+   * Get the node currently being processed.
+   *
+   * @return the current node being processed
+   */
+  Node getCurrentNode()
+  {
+    return m_currentNode;
+  }
+
+  /**
+   * Return null since there is no Writer for this class.
+   *
+   * @return null
+   */
+  java.io.Writer getWriter()
+  {
+    return null;
+  }
+
+  /**
+   * Append a node to the current container.
+   *
+   * @param newNode New node to append
+   */
+  protected void append(Node newNode) throws org.xml.sax.SAXException
+  {
+    Node parent = m_currentNode;
+
+    // Preferred target: the node that is currently being built.
+    if (parent != null)
+    {
+      parent.appendChild(newNode);
+      return;
+    }
+
+    // Next choice: the document fragment, when one was supplied.
+    if (m_docFrag != null)
+    {
+      m_docFrag.appendChild(newNode);
+      return;
+    }
+
+    // Otherwise the node would become a direct child of the document,
+    // which only tolerates certain node types.
+    switch (newNode.getNodeType())
+    {
+      case Node.TEXT_NODE:
+      {
+        String value = newNode.getNodeValue();
+        boolean hasContent = (value != null) && (value.trim().length() > 0);
+
+        if (hasContent)
+        {
+          throw new org.xml.sax.SAXException("Warning: can't output text before document element!  Ignoring...");
+        }
+
+        // Null or whitespace-only text at document level is dropped.
+        return;
+      }
+      case Node.ELEMENT_NODE:
+      {
+        // A document may only have a single root element.
+        if (m_doc.getDocumentElement() != null)
+        {
+          throw new org.xml.sax.SAXException("Can't have more than one root on a DOM!");
+        }
+
+        break;
+      }
+      default:
+        break;
+    }
+
+    m_doc.appendChild(newNode);
+  }
+
+  /**
+   * Receive an object for locating the origin of SAX document events.
+   *
+   * <p>SAX parsers are strongly encouraged (though not absolutely
+   * required) to supply a locator: if it does so, it must supply
+   * the locator to the application by invoking this method before
+   * invoking any of the other methods in the ContentHandler
+   * interface.</p>
+   *
+   * <p>The locator allows the application to determine the end
+   * position of any document-related event, even if the parser is
+   * not reporting an error.  Typically, the application will
+   * use this information for reporting its own errors (such as
+   * character content that does not match an application's
+   * business rules).  The information returned by the locator
+   * is probably not sufficient for use with a search engine.</p>
+   *
+   * <p>Note that the locator will return correct information only
+   * during the invocation of the events in this interface.  The
+   * application should not attempt to use it at any other time.</p>
+   *
+   * @param locator An object that can return the location of
+   *                any SAX document event.
+   * @see org.xml.sax.Locator
+   */
+  public void setDocumentLocator(Locator locator)
+  {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the beginning of a document.
+   *
+   * <p>The SAX parser will invoke this method only once, before any
+   * other methods in this interface or in DTDHandler (except for
+   * setDocumentLocator).</p>
+   */
+  public void startDocument() throws org.xml.sax.SAXException
+  {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the end of a document.
+   *
+   * <p>The SAX parser will invoke this method only once, and it will
+   * be the last method invoked during the parse.  The parser shall
+   * not invoke this method until it has either abandoned parsing
+   * (because of an unrecoverable error) or reached the end of
+   * input.</p>
+   */
+  public void endDocument() throws org.xml.sax.SAXException
+  {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the beginning of an element.
+   *
+   * <p>The Parser will invoke this method at the beginning of every
+   * element in the XML document; there will be a corresponding
+   * endElement() event for every startElement() event (even when the
+   * element is empty). All of the element's content will be
+   * reported, in order, before the corresponding endElement()
+   * event.</p>
+   *
+   * <p>If the element name has a namespace prefix, the prefix will
+   * still be attached.  Note that the attribute list provided will
+   * contain only attributes with explicit values (specified or
+   * defaulted): #IMPLIED attributes will be omitted.</p>
+   *
+   *
+   * @param ns The namespace of the node
+   * @param localName The local part of the qualified name
+   * @param name The element name.
+   * @param atts The attributes attached to the element, if any.
+   * @see #endElement
+   * @see org.xml.sax.Attributes
+   */
+  public void startElement(
+          String ns, String localName, String name, Attributes atts)
+            throws org.xml.sax.SAXException
+  {
+
+    // Note that the namespace-aware call must be used to correctly
+    // construct a Level 2 DOM, even for non-namespaced nodes.
+    // DOM represents "no namespace" as null, never as the empty string.
+    String elemNS = ((null == ns) || (ns.length() == 0)) ? null : ns;
+    Element elem = m_doc.createElementNS(elemNS, name);
+
+    // The element is attached to its parent before its attributes are set.
+    append(elem);
+
+    try
+    {
+      int nAtts = atts.getLength();
+
+      for (int i = 0; i < nAtts; i++)
+      {
+        // First handle a possible ID attribute
+        if (atts.getType(i).equalsIgnoreCase("ID"))
+          setIDAttribute(atts.getValue(i), elem);
+
+        String attrNS = atts.getURI(i);
+
+        if ("".equals(attrNS))
+          attrNS = null; // DOM represents no-namespace as null
+
+        String attrQName = atts.getQName(i);
+
+        // In SAX, namespace declarations have an empty namespace, while in
+        // DOM they must be in the xmlns namespace.  This applies both to
+        // prefixed declarations (xmlns:foo) and to the default namespace
+        // declaration (plain xmlns); setAttributeNS raises NAMESPACE_ERR
+        // for either one when it is given any other namespace.
+        if (attrQName.startsWith("xmlns:") || attrQName.equals("xmlns"))
+          attrNS = "http://www.w3.org/2000/xmlns/";
+
+        // ALWAYS use the DOM Level 2 call!
+        elem.setAttributeNS(attrNS, attrQName, atts.getValue(i));
+      }
+
+      // The new element becomes the container for subsequent events
+      // until the matching endElement() pops it again.
+      m_elemStack.push(elem);
+
+      m_currentNode = elem;
+    }
+    catch(java.lang.Exception de)
+    {
+      // Report any failure (including DOMException) through the SAX channel,
+      // keeping the original exception as the cause.
+      throw new org.xml.sax.SAXException(de);
+    }
+
+  }
+
+  /**
+
+
+
+   * Receive notification of the end of an element.
+   *
+   * <p>The SAX parser will invoke this method at the end of every
+   * element in the XML document; there will be a corresponding
+   * startElement() event for every endElement() event (even when the
+   * element is empty).</p>
+   *
+   * <p>If the element name has a namespace prefix, the prefix will
+   * still be attached to the name.</p>
+   *
+   *
+   * @param ns the namespace of the element
+   * @param localName The local part of the qualified name of the element
+   * @param name The element name
+   */
+  public void endElement(String ns, String localName, String name)
+          throws org.xml.sax.SAXException
+  {
+    // Discard the element that has just been closed.
+    m_elemStack.pop();
+
+    // The enclosing element, if any, becomes the current node again.
+    if (m_elemStack.isEmpty())
+    {
+      m_currentNode = null;
+    }
+    else
+    {
+      m_currentNode = (Node) m_elemStack.peek();
+    }
+  }
+
+  /**
+   * Set an ID string to node association in the ID table.
+   *
+   * @param id The ID string.
+   * @param elem The element associated with the ID.
+   */
+  public void setIDAttribute(String id, Element elem)
+  {
+
+    // Do nothing. This method is meant to be overridden.
+  }
+
+  /**
+   * Receive notification of character data.
+   *
+   * <p>The Parser will call this method to report each chunk of
+   * character data.  SAX parsers may return all contiguous character
+   * data in a single chunk, or they may split it into several
+   * chunks; however, all of the characters in any single event
+   * must come from the same external entity, so that the Locator
+   * provides useful information.</p>
+   *
+   * <p>The application must not attempt to read from the array
+   * outside of the specified range.</p>
+   *
+   * <p>Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating
+   * parsers must do so).</p>
+   *
+   * @param ch The characters from the XML document.
+   * @param start The start position in the array.
+   * @param length The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void characters(char ch[], int start, int length) throws org.xml.sax.SAXException
+  {
+    // Whitespace outside the document element is dropped:
+    // avoid DOM006 Hierarchy request error
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+    {
+      return;
+    }
+
+    // Inside a CDATA section the text is routed to cdata() instead.
+    if (m_inCData)
+    {
+      cdata(ch, start, length);
+      return;
+    }
+
+    String chunk = new String(ch, start, length);
+    Node last = (m_currentNode == null) ? null : m_currentNode.getLastChild();
+
+    if (last == null || last.getNodeType() != Node.TEXT_NODE)
+    {
+      // No text node to extend, so start a new one.
+      append(m_doc.createTextNode(chunk));
+    }
+    else
+    {
+      // Merge consecutive chunks into the existing trailing text node.
+      ((Text) last).appendData(chunk);
+    }
+  }
+
+  /**
+   * If available, when the disable-output-escaping attribute is used,
+   * output raw text without escaping.  A PI will be inserted in front
+   * of the node with the name "xslt-next-is-raw" and a value of
+   * "formatter-to-dom".
+   *
+   * @param ch Array containing the characters
+   * @param start Index to start of characters in the array
+   * @param length Number of characters in the array
+   */
+  public void charactersRaw(char ch[], int start, int length)
+          throws org.xml.sax.SAXException
+  {
+    if(isOutsideDocElem()
+       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return;  // avoid DOM006 Hierarchy request error
+
+
+    String s = new String(ch, start, length);
+
+    append(m_doc.createProcessingInstruction("xslt-next-is-raw",
+                                             "formatter-to-dom"));
+    append(m_doc.createTextNode(s));
+  }
+
+  /**
+   * Report the beginning of an entity.
+   *
+   * The start and end of the document entity are not reported.
+   * The start and end of the external DTD subset are reported
+   * using the pseudo-name "[dtd]".  All other events must be
+   * properly nested within start/end entity events.
+   *
+   * @param name The name of the entity.  If it is a parameter
+   *        entity, the name will begin with '%'.
+   * @see #endEntity
+   * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
+   * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
+   */
+  public void startEntity(String name) throws org.xml.sax.SAXException
+  {
+
+    // Almost certainly the wrong behavior...
+    // entityReference(name);
+  }
+
+  /**
+   * Report the end of an entity.
+   *
+   * @param name The name of the entity that is ending.
+   * @see #startEntity
+   */
+  public void endEntity(String name) throws org.xml.sax.SAXException{}
+
+  /**
+   * Receive notification of an entity reference.
+   *
+   * @param name name of the entity reference
+   */
+  public void entityReference(String name) throws org.xml.sax.SAXException
+  {
+    append(m_doc.createEntityReference(name));
+  }
+
+  /**
+   * Receive notification of ignorable whitespace in element content.
+   *
+   * <p>Validating Parsers must use this method to report each chunk
+   * of ignorable whitespace (see the W3C XML 1.0 recommendation,
+   * section 2.10): non-validating parsers may also use this method
+   * if they are capable of parsing and using content models.</p>
+   *
+   * <p>SAX parsers may return all contiguous whitespace in a single
+   * chunk, or they may split it into several chunks; however, all of
+   * the characters in any single event must come from the same
+   * external entity, so that the Locator provides useful
+   * information.</p>
+   *
+   * <p>The application must not attempt to read from the array
+   * outside of the specified range.</p>
+   *
+   * @param ch The characters from the XML document.
+   * @param start The start position in the array.
+   * @param length The number of characters to read from the array.
+   * @see #characters
+   */
+  public void ignorableWhitespace(char ch[], int start, int length)
+          throws org.xml.sax.SAXException
+  {
+    if (!isOutsideDocElem())
+    {
+      // Inside the document element the whitespace is kept as a text node.
+      String whitespace = new String(ch, start, length);
+
+      append(m_doc.createTextNode(whitespace));
+    }
+
+    // Outside the document element nothing is added:
+    // avoid DOM006 Hierarchy request error
+  }
+
+  /**
+   * Tell if the current node is outside the document element.
+   *
+   * @return true if the current node is outside the document element.
+   */
+   private boolean isOutsideDocElem()
+   {
+      // Building into a fragment, or having any open element, means we
+      // are not at document level.
+      if (m_docFrag != null || !m_elemStack.isEmpty())
+      {
+         return false;
+      }
+
+      // At document level when there is no current node at all, or the
+      // current node is the document itself.
+      return m_currentNode == null
+             || m_currentNode.getNodeType() == Node.DOCUMENT_NODE;
+   }
+
+  /**
+   * Receive notification of a processing instruction.
+   *
+   * <p>The Parser will invoke this method once for each processing
+   * instruction found: note that processing instructions may occur
+   * before or after the main document element.</p>
+   *
+   * <p>A SAX parser should never report an XML declaration (XML 1.0,
+   * section 2.8) or a text declaration (XML 1.0, section 4.3.1)
+   * using this method.</p>
+   *
+   * @param target The processing instruction target.
+   * @param data The processing instruction data, or null if
+   *        none was supplied.
+   */
+  public void processingInstruction(String target, String data)
+          throws org.xml.sax.SAXException
+  {
+    append(m_doc.createProcessingInstruction(target, data));
+  }
+
+  /**
+   * Report an XML comment anywhere in the document.
+   *
+   * This callback will be used for comments inside or outside the
+   * document element, including comments in the external DTD
+   * subset (if read).
+   *
+   * @param ch An array holding the characters in the comment.
+   * @param start The starting position in the array.
+   * @param length The number of characters to use from the array.
+   */
+  public void comment(char ch[], int start, int length) throws org.xml.sax.SAXException
+  {
+    // tagsoup sometimes submits invalid values here, so any range that does
+    // not lie inside the array is silently dropped.  A range that ends
+    // exactly at the end of the array (start + length == ch.length) is
+    // valid and must be kept; comparing against (ch.length - start) rather
+    // than computing start + length avoids int overflow.
+    if (ch == null || start < 0 || length < 0 || length > (ch.length - start)) return;
+    append(m_doc.createComment(new String(ch, start, length)));
+  }
+
+  /** Flag indicating that we are processing a CData section          */
+  protected boolean m_inCData = false;
+
+  /**
+   * Report the start of a CDATA section.
+   *
+   * @see #endCDATA
+   */
+  public void startCDATA() throws org.xml.sax.SAXException
+  {
+    m_inCData = true;
+    append(m_doc.createCDATASection(""));
+  }
+
+  /**
+   * Report the end of a CDATA section.
+   *
+   * @see #startCDATA
+   */
+  public void endCDATA() throws org.xml.sax.SAXException
+  {
+    m_inCData = false;
+  }
+
+  /**
+   * Receive notification of cdata.
+   *
+   * <p>The Parser will call this method to report each chunk of
+   * character data.  SAX parsers may return all contiguous character
+   * data in a single chunk, or they may split it into several
+   * chunks; however, all of the characters in any single event
+   * must come from the same external entity, so that the Locator
+   * provides useful information.</p>
+   *
+   * <p>The application must not attempt to read from the array
+   * outside of the specified range.</p>
+   *
+   * <p>Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating
+   * parsers must do so).</p>
+   *
+   * @param ch The characters from the XML document.
+   * @param start The start position in the array.
+   * @param length The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void cdata(char ch[], int start, int length) throws org.xml.sax.SAXException
+  {
+    if(isOutsideDocElem()
+       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return;  // avoid DOM006 Hierarchy request error
+
+    String s = new String(ch, start, length);
+
+    // Look in the same container that append() would have used when
+    // startCDATA() created the section: the current node if there is one,
+    // otherwise the document fragment.  Dereferencing m_currentNode
+    // unconditionally would throw a NullPointerException when no element
+    // is open.
+    Node container = (null != m_currentNode) ? m_currentNode : (Node) m_docFrag;
+    if (null == container)
+      return;  // nothing has been opened that the data could be added to
+
+    // XXX ab@apache.org: modified from the original, to accommodate TagSoup.
+    // The data is appended to a trailing CDATA section or comment; if the
+    // last child is anything else the data is dropped.
+    Node n = container.getLastChild();
+    if (n instanceof CDATASection)
+      ((CDATASection)n).appendData(s);
+    else if (n instanceof Comment)
+      ((Comment)n).appendData(s);
+  }
+
+  /**
+   * Report the start of DTD declarations, if any.
+   *
+   * Any declarations are assumed to be in the internal subset
+   * unless otherwise indicated.
+   *
+   * @param name The document type name.
+   * @param publicId The declared public identifier for the
+   *        external DTD subset, or null if none was declared.
+   * @param systemId The declared system identifier for the
+   *        external DTD subset, or null if none was declared.
+   * @see #endDTD
+   * @see #startEntity
+   */
+  public void startDTD(String name, String publicId, String systemId)
+          throws org.xml.sax.SAXException
+  {
+
+    // Do nothing for now.
+  }
+
+  /**
+   * Report the end of DTD declarations.
+   *
+   * @see #startDTD
+   */
+  public void endDTD() throws org.xml.sax.SAXException
+  {
+
+    // Do nothing for now.
+  }
+
+  /**
+   * Begin the scope of a prefix-URI Namespace mapping.
+   *
+   * <p>The information from this event is not necessary for
+   * normal Namespace processing: the SAX XML reader will
+   * automatically replace prefixes for element and attribute
+   * names when the http://xml.org/sax/features/namespaces
+   * feature is true (the default).</p>
+   *
+   * <p>There are cases, however, when applications need to
+   * use prefixes in character data or in attribute values,
+   * where they cannot safely be expanded automatically; the
+   * start/endPrefixMapping event supplies the information
+   * to the application to expand prefixes in those contexts
+   * itself, if necessary.</p>
+   *
+   * <p>Note that start/endPrefixMapping events are not
+   * guaranteed to be properly nested relative to each-other:
+   * all startPrefixMapping events will occur before the
+   * corresponding startElement event, and all endPrefixMapping
+   * events will occur after the corresponding endElement event,
+   * but their order is not guaranteed.</p>
+   *
+   * @param prefix The Namespace prefix being declared.
+   * @param uri The Namespace URI the prefix is mapped to.
+   * @see #endPrefixMapping
+   * @see #startElement
+   */
+  public void startPrefixMapping(String prefix, String uri)
+          throws org.xml.sax.SAXException
+  {
+
+    /*
+    // Not sure if this is needed or wanted
+    // Also, it fails in the stree.
+    if((null != m_currentNode)
+       && (m_currentNode.getNodeType() == Node.ELEMENT_NODE))
+    {
+      String qname;
+      if(((null != prefix) && (prefix.length() == 0))
+         || (null == prefix))
+        qname = "xmlns";
+      else
+        qname = "xmlns:"+prefix;
+
+      Element elem = (Element)m_currentNode;
+      String val = elem.getAttribute(qname); // Obsolete, should be DOM2...?
+      if(val == null)
+      {
+        elem.setAttributeNS("http://www.w3.org/XML/1998/namespace",
+                            qname, uri);
+      }
+    }
+    */
+  }
+
+  /**
+   * End the scope of a prefix-URI mapping.
+   *
+   * <p>See startPrefixMapping for details.  This event will
+   * always occur after the corresponding endElement event,
+   * but the order of endPrefixMapping events is not otherwise
+   * guaranteed.</p>
+   *
+   * @param prefix The prefix that was being mapped.
+   * @see #startPrefixMapping
+   * @see #endElement
+   */
+  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException{}
+
+  /**
+   * Receive notification of a skipped entity.
+   *
+   * <p>The Parser will invoke this method once for each entity
+   * skipped.  Non-validating processors may skip entities if they
+   * have not seen the declarations (because, for example, the
+   * entity was declared in an external DTD subset).  All processors
+   * may skip external entities, depending on the values of the
+   * http://xml.org/sax/features/external-general-entities and the
+   * http://xml.org/sax/features/external-parameter-entities
+   * properties.</p>
+   *
+   * @param name The name of the skipped entity.  If it is a
+   *        parameter entity, the name will begin with '%'.
+   */
+  public void skippedEntity(String name) throws org.xml.sax.SAXException{}
+}

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Wed Jun 30 10:36:20 2010
@@ -0,0 +1,419 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NodeWalker;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * A collection of methods for extracting content from DOM trees.
+ * 
+ * This class holds a few utility methods for pulling content out of 
+ * DOM nodes, such as getOutlinks, getText, etc.
+ *
+ */
+class DOMContentUtils {
+
+  private static class LinkParams {
+	private String elName;
+	private String attrName;
+	private int childLen;
+      
+	private LinkParams(String elName, String attrName, int childLen) {
+          this.elName = elName;
+          this.attrName = attrName;
+          this.childLen = childLen;
+      }
+      
+	public String toString() {
+          return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
+      }
+  }
+  
+  private HashMap linkParams = new HashMap();
+  private Configuration conf;
+  
+  DOMContentUtils(Configuration conf) {
+    setConf(conf);
+  }
+  
+  /**
+   * Stores the configuration and rebuilds the table of link-bearing
+   * elements from it.
+   *
+   * <p>FORM elements are included unless
+   * <code>parser.html.form.use_action</code> is false.  Every tag listed
+   * in <code>parser.html.outlinks.ignore_tags</code> is then removed
+   * again, except that "form" is kept when
+   * <code>parser.html.form.use_action</code> was set explicitly and is
+   * true.</p>
+   *
+   * @param conf configuration to read the parser settings from
+   */
+  private void setConf(Configuration conf) {
+    this.conf = conf;
+    linkParams.clear();
+
+    // An explicitly enabled form setting overrides the ignore list below.
+    boolean useFormAction =
+      conf.getBoolean("parser.html.form.use_action", true);
+    boolean keepForm =
+      useFormAction && conf.get("parser.html.form.use_action") != null;
+
+    linkParams.put("a", new LinkParams("a", "href", 1));
+    linkParams.put("area", new LinkParams("area", "href", 0));
+    if (useFormAction) {
+      linkParams.put("form", new LinkParams("form", "action", 1));
+    }
+    linkParams.put("frame", new LinkParams("frame", "src", 0));
+    linkParams.put("iframe", new LinkParams("iframe", "src", 0));
+    linkParams.put("script", new LinkParams("script", "src", 0));
+    linkParams.put("link", new LinkParams("link", "href", 0));
+    linkParams.put("img", new LinkParams("img", "src", 0));
+
+    // Drop the tags the configuration asks us to ignore.
+    String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
+    if (ignoreTags == null) {
+      return;
+    }
+    for (String tag : ignoreTags) {
+      if (keepForm && "form".equals(tag)) {
+        continue;
+      }
+      linkParams.remove(tag);
+    }
+  }
+  
+  /**
+   * Appends all content text found beneath the DOM {@link Node} to the
+   * given {@link StringBuffer}.
+   *
+   * <p>
+   *
+   * If <code>abortOnNestedAnchors</code> is true, DOM traversal stops
+   * as soon as a nested anchor is found, and the
+   * <code>StringBuffer</code> will not contain any text encountered
+   * after that point.
+   *
+   * @param sb buffer that receives the extracted text
+   * @param node root of the DOM subtree to read text from
+   * @param abortOnNestedAnchors whether to stop at a nested anchor
+   * @return true if nested anchors were found
+   */
+  private boolean getText(StringBuffer sb, Node node, 
+                                      boolean abortOnNestedAnchors) {
+    // The helper starts counting anchors from zero.
+    return getTextHelper(sb, node, abortOnNestedAnchors, 0);
+  }
+
+
+  /**
+   * This is a convenience method, equivalent to {@link
+   * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}:
+   * all text beneath <code>node</code> is appended to <code>sb</code>
+   * and nested anchors do not abort the traversal.
+   *
+   * @param sb buffer that receives the extracted text
+   * @param node root of the DOM subtree to read text from
+   */
+  void getText(StringBuffer sb, Node node) {
+    getText(sb, node, false);
+  }
+
+  // Walks the subtree rooted at node and appends its text to sb, with
+  // runs of whitespace collapsed to single spaces and pieces separated
+  // by one space.  The children of script, style and comment nodes are
+  // skipped.  Returns true if abortOnNestedAnchors is true and the walk
+  // was cut short because the anchor count went above one.
+  private boolean getTextHelper(StringBuffer sb, Node node, 
+                                             boolean abortOnNestedAnchors,
+                                             int anchorDepth) {
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+      Node current = walker.nextNode();
+      String name = current.getNodeName();
+      short type = current.getNodeType();
+
+      if ("script".equalsIgnoreCase(name)
+          || "style".equalsIgnoreCase(name)) {
+        walker.skipChildren();
+      }
+
+      if (abortOnNestedAnchors && "a".equalsIgnoreCase(name)) {
+        anchorDepth++;
+        if (anchorDepth > 1) {
+          // second anchor seen during this walk: give up
+          return true;
+        }
+      }
+
+      if (type == Node.COMMENT_NODE) {
+        walker.skipChildren();
+      }
+
+      if (type == Node.TEXT_NODE) {
+        // collapse whitespace runs and trim before appending
+        String text = current.getNodeValue().replaceAll("\\s+", " ").trim();
+        if (text.length() > 0) {
+          if (sb.length() > 0) {
+            sb.append(' ');
+          }
+          sb.append(text);
+        }
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * Appends the content text found beneath the first <code>title</code>
+   * element under the DOM {@link Node} to the {@link StringBuffer}.
+   * The search stops as soon as a node named <code>body</code> is
+   * reached.
+   *
+   * @param sb buffer that receives the title text
+   * @param node root of the DOM subtree to search
+   * @return true if a title node was found, false otherwise
+   */
+  boolean getTitle(StringBuffer sb, Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+      Node current = walker.nextNode();
+      String name = current.getNodeName();
+
+      // stop after HEAD
+      if ("body".equalsIgnoreCase(name)) {
+        return false;
+      }
+
+      boolean isTitleElement = current.getNodeType() == Node.ELEMENT_NODE
+        && "title".equalsIgnoreCase(name);
+      if (isTitleElement) {
+        getText(sb, current);
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * If Node contains a BASE tag then its HREF is returned.
+   *
+   * <p>The search ends at the first BODY element.  An HREF value that
+   * is not a valid URL is ignored and the search continues.</p>
+   *
+   * @param node root of the DOM subtree to search
+   * @return the URL of the first usable BASE HREF, or <code>null</code>
+   *         if there is none
+   */
+  URL getBase(Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+      Node current = walker.nextNode();
+      if (current.getNodeType() != Node.ELEMENT_NODE) {
+        continue;
+      }
+
+      String name = current.getNodeName();
+      if ("body".equalsIgnoreCase(name)) { // stop after HEAD
+        return null;
+      }
+      if (!"base".equalsIgnoreCase(name)) {
+        continue;
+      }
+
+      // this is a BASE tag: look for its HREF attribute
+      NamedNodeMap attrs = current.getAttributes();
+      for (int i = 0; i < attrs.getLength(); i++) {
+        Node attr = attrs.item(i);
+        if (!"href".equalsIgnoreCase(attr.getNodeName())) {
+          continue;
+        }
+        try {
+          return new URL(attr.getNodeValue());
+        } catch (MalformedURLException ignored) {
+          // unusable HREF value: keep looking
+        }
+      }
+    }
+
+    // no BASE tag with a usable HREF
+    return null;
+  }
+
+
+  /**
+   * Tells whether the value of the node consists solely of whitespace
+   * characters (as defined by {@link Character#isWhitespace(char)}).
+   * An empty value counts as whitespace-only.
+   */
+  private boolean hasOnlyWhiteSpace(Node node) {
+    for (char c : node.getNodeValue().toCharArray()) {
+      if (!Character.isWhitespace(c)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Decides whether a link element should be discarded.  This only
+  // covers a few cases of empty links that are symptomatic of
+  // nekohtml's DOM-fixup process: an element without children that is
+  // expected to have some, or an element whose only children are a
+  // single nested element of the same kind plus whitespace-only text.
+  private boolean shouldThrowAwayLink(Node node, NodeList children, 
+                                              int childLen, LinkParams params) {
+    switch (childLen) {
+    case 0:
+      // no inner structure: acceptable only for elements marked with
+      // a childLen of 0
+      return params.childLen != 0;
+
+    case 1:
+      // single nested link
+      return isNestedLink(children.item(0), params);
+
+    case 2: {
+      Node first = children.item(0);
+      Node second = children.item(1);
+      // single link followed or preceded by a whitespace node
+      return (isNestedLink(first, params) && isBlankText(second))
+        || (isNestedLink(second, params) && isBlankText(first));
+    }
+
+    case 3:
+      // single link surrounded by whitespace nodes
+      return isNestedLink(children.item(1), params)
+        && isBlankText(children.item(0))
+        && isBlankText(children.item(2));
+
+    default:
+      return false;
+    }
+  }
+
+  // true if child is an element with the same name as the link element
+  private boolean isNestedLink(Node child, LinkParams params) {
+    return child.getNodeType() == Node.ELEMENT_NODE
+      && params.elName.equalsIgnoreCase(child.getNodeName());
+  }
+
+  // true if child is a text node holding nothing but whitespace
+  private boolean isBlankText(Node child) {
+    return child.getNodeType() == Node.TEXT_NODE
+      && hasOnlyWhiteSpace(child);
+  }
+  
+  /**
+   * Handles cases where the url param information is encoded into the base
+   * url as opposed to the target.
+   * <p>
+   * If the target contains params (i.e. ';xxxx') information then the target
+   * params information is assumed to be correct and any base params
+   * information is ignored.  If the base contains params information but the
+   * target does not, then the params information is moved to the target
+   * allowing it to be correctly determined by the java.net.URL class.
+   * 
+   * @param base The base URL.
+   * @param target The target path from the base URL.
+   * 
+   * @return URL A URL with the params information correctly encoded.
+   * 
+   * @throws MalformedURLException If the url is not a well formed URL.
+   */
+  private URL fixEmbeddedParams(URL base, String target) 
+    throws MalformedURLException{
+
+    // the target carries its own params: nothing to convert
+    if (target.indexOf(';') >= 0) {
+      return new URL(base, target);
+    }
+
+    // the base has no params either: nothing to convert
+    String baseText = base.toString();
+    int paramsAt = baseText.indexOf(';');
+    if (paramsAt == -1) {
+      return new URL(base, target);
+    }
+
+    // everything from the first ';' of the base onwards
+    String params = baseText.substring(paramsAt);
+
+    // put the params after any path but before the query string of the
+    // target; without a query string they are simply appended
+    int queryAt = target.indexOf('?');
+    String fixedTarget = (queryAt >= 0)
+      ? target.substring(0, queryAt) + params + target.substring(queryAt)
+      : target + params;
+
+    return new URL(base, fixedTarget);
+  }
+
+  /**
+   * This method finds all link-bearing elements below the supplied DOM
+   * <code>node</code>, and creates appropriate {@link Outlink}
+   * records for each (relative to the supplied <code>base</code>
+   * URL), and adds them to the <code>outlinks</code> {@link
+   * ArrayList}.
+   *
+   * <p>
+   *
+   * Links without inner structure (tags, text, etc) are discarded, as
+   * are links which contain only single nested links and empty text
+   * nodes (this is a common DOM-fixup artifact, at least with
+   * nekohtml).
+   *
+   * <p>
+   *
+   * A link is also skipped when it has no target attribute, when its
+   * <code>rel</code> attribute contains the <code>nofollow</code> token,
+   * when its <code>method</code> attribute is <code>post</code>, or
+   * when the target cannot be turned into a URL.
+   *
+   * @param base URL against which relative targets are resolved
+   * @param outlinks list that receives the {@link Outlink} records
+   * @param node root of the DOM subtree to search
+   */
+  void getOutlinks(URL base, ArrayList outlinks, 
+                                       Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
+        continue;
+      }
+
+      // Lower-case with a fixed locale: with the default locale the
+      // lookup fails for names containing 'I' under e.g. a Turkish locale.
+      String nodeName =
+        currentNode.getNodeName().toLowerCase(java.util.Locale.ROOT);
+      LinkParams params = (LinkParams) linkParams.get(nodeName);
+      if (params == null) {
+        continue;
+      }
+
+      NodeList children = currentNode.getChildNodes();
+      int childLen = (children != null) ? children.getLength() : 0;
+      if (shouldThrowAwayLink(currentNode, children, childLen, params)) {
+        continue;
+      }
+
+      StringBuffer linkText = new StringBuffer();
+      getText(linkText, currentNode, true);
+
+      NamedNodeMap attrs = currentNode.getAttributes();
+      String target = null;
+      boolean noFollow = false;
+      boolean post = false;
+      for (int i = 0; i < attrs.getLength(); i++) {
+        Node attr = attrs.item(i);
+        String attrName = attr.getNodeName();
+        if (params.attrName.equalsIgnoreCase(attrName)) {
+          target = attr.getNodeValue();
+        } else if ("rel".equalsIgnoreCase(attrName)) {
+          // rel is a space-separated token list, e.g. "external nofollow"
+          if (hasRelToken(attr.getNodeValue(), "nofollow")) {
+            noFollow = true;
+          }
+        } else if ("method".equalsIgnoreCase(attrName) &&
+                   "post".equalsIgnoreCase(attr.getNodeValue())) {
+          post = true;
+        }
+      }
+
+      if (target == null || noFollow || post) {
+        continue;
+      }
+
+      try {
+        // fixEmbeddedParams falls back to a plain new URL(base, target)
+        // when the base carries no ';' params, so it can always be used
+        URL url = fixEmbeddedParams(base, target);
+        outlinks.add(new Outlink(url.toString(),
+                                 linkText.toString().trim()));
+      } catch (MalformedURLException e) {
+        // target is not usable as a URL: skip this link
+      }
+      // NOTE: the walk deliberately continues into the children of this
+      // element, including elements that normally have none.
+    }
+  }
+
+  /**
+   * Tells whether the space-separated <code>rel</code> attribute value
+   * contains the given token, ignoring case.
+   */
+  private static boolean hasRelToken(String rel, String token) {
+    if (rel == null) {
+      return false;
+    }
+    String[] tokens = rel.trim().split("\\s+");
+    for (int i = 0; i < tokens.length; i++) {
+      if (token.equalsIgnoreCase(tokens[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+}
+

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java Wed Jun 30 10:36:20 2010
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import java.net.URL;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.w3c.dom.*;
+
+/**
+ * Class for parsing META Directives from DOM trees.  This class
+ * handles specifically Robots META directives (all, none, nofollow,
+ * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
+ * instructions. All meta directives are stored in a HTMLMetaTags instance.
+ */
+class HTMLMetaProcessor {
+
+  /**
+   * Utility class with indicators for the robots directives "noindex"
+   * and "nofollow", and HTTP-EQUIV/no-cache
+   */
+  
+  /**
+   * Resets <code>metaTags</code> and then sets its indicators to
+   * appropriate values, based on any META and BASE tags found under
+   * the given <code>node</code>.
+   *
+   * @param metaTags receives the directives found; reset first
+   * @param node root of the DOM subtree to scan
+   * @param currURL URL of the current document, passed on to the
+   *        helper for resolving refresh and base URLs
+   */
+  static final void getMetaTags (
+    HTMLMetaTags metaTags, Node node, URL currURL) {
+
+    metaTags.reset();
+    getMetaTagsHelper(metaTags, node, currURL);
+  }
+
+  private static final void getMetaTagsHelper(
+    HTMLMetaTags metaTags, Node node, URL currURL) {
+
+    if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+      if ("body".equalsIgnoreCase(node.getNodeName())) {
+        // META tags should not be under body
+        return;
+      }
+
+      if ("meta".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node nameNode = null;
+        Node equivNode = null;
+        Node contentNode = null;
+        // Retrieves name, http-equiv and content attribues
+        for (int i=0; i<attrs.getLength(); i++) {
+          Node attr = attrs.item(i);
+          String attrName = attr.getNodeName().toLowerCase();
+          if (attrName.equals("name")) {
+            nameNode = attr;
+          } else if (attrName.equals("http-equiv")) {
+            equivNode = attr;
+          } else if (attrName.equals("content")) {
+            contentNode = attr;
+          }
+        }
+        
+        if (nameNode != null) {
+          if (contentNode != null) {
+            String name = nameNode.getNodeValue().toLowerCase();
+            metaTags.getGeneralTags().setProperty(name, contentNode.getNodeValue());
+            if ("robots".equals(name)) {
+  
+              if (contentNode != null) {
+                String directives = 
+                  contentNode.getNodeValue().toLowerCase();
+                int index = directives.indexOf("none");
+  
+                if (index >= 0) {
+                  metaTags.setNoIndex();
+                  metaTags.setNoFollow();
+                }
+  
+                index = directives.indexOf("all");
+                if (index >= 0) {
+                  // do nothing...
+                }
+  
+                index = directives.indexOf("noindex");
+                if (index >= 0) {
+                  metaTags.setNoIndex();
+                }
+  
+                index = directives.indexOf("nofollow");
+                if (index >= 0) {
+                  metaTags.setNoFollow();
+                }
+                
+                index = directives.indexOf("noarchive");
+                if (index >= 0) {
+                  metaTags.setNoCache();
+                }
+              } 
+  
+            } // end if (name == robots)
+          }
+        }
+
+        if (equivNode != null) {
+          if (contentNode != null) {
+            String name = equivNode.getNodeValue().toLowerCase();
+            String content = contentNode.getNodeValue();
+            metaTags.getHttpEquivTags().setProperty(name, content);
+            if ("pragma".equals(name)) {
+              content = content.toLowerCase();
+              int index = content.indexOf("no-cache");
+              if (index >= 0) 
+                metaTags.setNoCache();
+            } else if ("refresh".equals(name)) {
+              int idx = content.indexOf(';');
+              String time = null;
+              if (idx == -1) { // just the refresh time
+                time = content;
+              } else time = content.substring(0, idx);
+              try {
+                metaTags.setRefreshTime(Integer.parseInt(time));
+                // skip this if we couldn't parse the time
+                metaTags.setRefresh(true);
+              } catch (Exception e) {
+                ;
+              }
+              URL refreshUrl = null;
+              if (metaTags.getRefresh() && idx != -1) { // set the URL
+                idx = content.toLowerCase().indexOf("url=");
+                if (idx == -1) { // assume a mis-formatted entry with just the url
+                  idx = content.indexOf(';') + 1;
+                } else idx += 4;
+                if (idx != -1) {
+                  String url = content.substring(idx);
+                  try {
+                    refreshUrl = new URL(url);
+                  } catch (Exception e) {
+                    // XXX according to the spec, this has to be an absolute
+                    // XXX url. However, many websites use relative URLs and
+                    // XXX expect browsers to handle that.
+                    // XXX Unfortunately, in some cases this may create a
+                    // XXX infinitely recursive paths (a crawler trap)...
+                    // if (!url.startsWith("/")) url = "/" + url;
+                    try {
+                      refreshUrl = new URL(currURL, url);
+                    } catch (Exception e1) {
+                      refreshUrl = null;
+                    }
+                  }
+                }
+              }
+              if (metaTags.getRefresh()) {
+                if (refreshUrl == null) {
+                  // apparently only refresh time was present. set the URL
+                  // to the same URL.
+                  refreshUrl = currURL;
+                }
+                metaTags.setRefreshHref(refreshUrl);
+              }
+            }
+          }
+        }
+
+      } else if ("base".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node hrefNode = attrs.getNamedItem("href");
+
+        if (hrefNode != null) {
+          String urlString = hrefNode.getNodeValue();
+
+          URL url = null;
+          try {
+            if (currURL == null)
+              url = new URL(urlString);
+            else 
+              url = new URL(currURL, urlString);
+          } catch (Exception e) {
+            ;
+          }
+
+          if (url != null) 
+            metaTags.setBaseHref(url);
+        }
+
+      }
+
+    }
+
+    NodeList children = node.getChildNodes();
+    if (children != null) {
+      int len = children.getLength();
+      for (int i = 0; i < len; i++) {
+        getMetaTagsHelper(metaTags, children.item(i), currURL);
+      }
+    }
+  }
+
+}

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java Wed Jun 30 10:36:20 2010
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import javax.imageio.spi.ServiceRegistry;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.mime.MimeTypesFactory;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+/**
+ * Parse xml config file.
+ */
+public class TikaConfig {
+
+    private final Map<String, Parser> parsers = new HashMap<String, Parser>();
+
+    private final MimeTypes mimeTypes;
+
+    /**
+     * Builds a configuration from the XML file at the given path.
+     *
+     * @param file path of the XML configuration file
+     */
+    public TikaConfig(String file)
+            throws TikaException, IOException, SAXException {
+        this(new File(file));
+    }
+
+    /**
+     * Builds a configuration by parsing the given XML file.
+     *
+     * @param file XML configuration file
+     */
+    public TikaConfig(File file)
+            throws TikaException, IOException, SAXException {
+        this(getBuilder().parse(file));
+    }
+
+    /**
+     * Builds a configuration by parsing the XML document at the given URL.
+     *
+     * @param url location of the XML configuration document
+     */
+    public TikaConfig(URL url)
+            throws TikaException, IOException, SAXException {
+        this(getBuilder().parse(url.toString()));
+    }
+
+    /**
+     * Builds a configuration by parsing XML read from the given stream.
+     *
+     * @param stream stream that supplies the XML configuration document
+     */
+    public TikaConfig(InputStream stream)
+            throws TikaException, IOException, SAXException {
+        this(getBuilder().parse(stream));
+    }
+
+    /**
+     * Builds a configuration by parsing XML read from the given stream.
+     * The <code>delegate</code> argument is ignored.
+     *
+     * @param stream stream that supplies the XML configuration document
+     * @param delegate unused
+     * @deprecated This method will be removed in Apache Tika 1.0
+     * @see <a
+     *      href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
+     */
+    @Deprecated
+    public TikaConfig(InputStream stream, Parser delegate)
+            throws TikaException, IOException, SAXException {
+        this(stream);
+    }
+
+    /**
+     * Builds a configuration from the root element of the given document.
+     *
+     * @param document parsed XML configuration document
+     */
+    public TikaConfig(Document document)
+            throws TikaException, IOException {
+        this(document.getDocumentElement());
+    }
+
+    /**
+     * Builds a configuration from the root element of the given document.
+     * The <code>delegate</code> argument is ignored.
+     *
+     * @param document parsed XML configuration document
+     * @param delegate unused
+     * @deprecated This method will be removed in Apache Tika 1.0
+     * @see <a
+     *      href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
+     */
+    @Deprecated
+    public TikaConfig(Document document, Parser delegate)
+            throws TikaException, IOException {
+        this(document);
+    }
+
+    /**
+     * Builds a configuration from the root element of a configuration
+     * document.
+     *
+     * <p>The MIME type repository is created from the
+     * <code>resource</code> attribute of a direct
+     * <code>mimeTypeRepository</code> child when present, and from
+     * "tika-mimetypes.xml" otherwise.  Every <code>parser</code> element
+     * names a class that is instantiated and registered under the MIME
+     * types listed in its <code>mime</code> elements, or, when it lists
+     * none, under the types the parser itself reports as supported.</p>
+     *
+     * @param element root element of the configuration
+     * @throws TikaException if a configured class cannot be found,
+     *         accessed or instantiated, or is not a Tika parser
+     */
+    public TikaConfig(Element element) throws TikaException, IOException {
+        String mimeResource = "tika-mimetypes.xml";
+        Element repository = getChild(element, "mimeTypeRepository");
+        if (repository != null && repository.hasAttribute("resource")) {
+            mimeResource = repository.getAttribute("resource");
+        }
+        mimeTypes = MimeTypesFactory.create(mimeResource);
+
+        NodeList parserNodes = element.getElementsByTagName("parser");
+        for (int i = 0; i < parserNodes.getLength(); i++) {
+            Element parserNode = (Element) parserNodes.item(i);
+            String className = parserNode.getAttribute("class");
+
+            try {
+                Object candidate = Class.forName(className).newInstance();
+                if (!(candidate instanceof Parser)) {
+                    throw new TikaException(
+                            "Configured class is not a Tika Parser: "
+                            + className);
+                }
+                Parser parser = (Parser) candidate;
+
+                NodeList mimes = parserNode.getElementsByTagName("mime");
+                int mimeCount = mimes.getLength();
+                if (mimeCount == 0) {
+                    // no explicit list: use what the parser says it supports
+                    ParseContext context = new ParseContext();
+                    for (MediaType type : parser.getSupportedTypes(context)) {
+                        parsers.put(type.toString(), parser);
+                    }
+                } else {
+                    for (int j = 0; j < mimeCount; j++) {
+                        String mime = getText(mimes.item(j)).trim();
+                        parsers.put(mime, parser);
+                    }
+                }
+            } catch (ClassNotFoundException e) {
+                throw new TikaException(
+                        "Configured parser class not found: " + className, e);
+            } catch (IllegalAccessException e) {
+                throw new TikaException(
+                        "Unable to access a parser class: " + className, e);
+            } catch (InstantiationException e) {
+                throw new TikaException(
+                        "Unable to instantiate a parser class: " + className,
+                        e);
+            }
+        }
+    }
+
+    /**
+     * Builds a configuration without a config file: every {@link Parser}
+     * provider found through {@link ServiceRegistry#lookupProviders} with
+     * this class's class loader is registered under the types it reports
+     * as supported, and the MIME type repository is created from
+     * "tika-mimetypes.xml".
+     */
+    public TikaConfig() throws MimeTypeException, IOException {
+        ParseContext context = new ParseContext();
+        ClassLoader loader = this.getClass().getClassLoader();
+        for (Iterator<Parser> providers =
+                ServiceRegistry.lookupProviders(Parser.class, loader);
+                providers.hasNext();) {
+            Parser parser = providers.next();
+            for (MediaType type : parser.getSupportedTypes(context)) {
+                parsers.put(type.toString(), parser);
+            }
+        }
+        mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml");
+    }
+
+    /**
+     * Builds a configuration from the root element of a configuration
+     * document.  The <code>delegate</code> argument is ignored.
+     *
+     * @param element root element of the configuration
+     * @param delegate unused
+     * @deprecated This method will be removed in Apache Tika 1.0
+     * @see <a
+     *      href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
+     */
+    @Deprecated
+    public TikaConfig(Element element, Parser delegate)
+            throws TikaException, IOException {
+        this(element);
+    }
+
+    /**
+     * Returns the text of a node: the value of a text node, the
+     * concatenated text of all children of an element, and the empty
+     * string for any other kind of node.
+     */
+    private String getText(Node node) {
+        switch (node.getNodeType()) {
+        case Node.TEXT_NODE:
+            return node.getNodeValue();
+        case Node.ELEMENT_NODE: {
+            StringBuilder text = new StringBuilder();
+            NodeList kids = node.getChildNodes();
+            for (int i = 0; i < kids.getLength(); i++) {
+                text.append(getText(kids.item(i)));
+            }
+            return text.toString();
+        }
+        default:
+            return "";
+        }
+    }
+
+    /**
+     * Returns the parser instance configured for the given MIME type. Returns
+     * <code>null</code> if the given MIME type is unknown. The lookup is an
+     * exact match on the string under which the parser was registered.
+     * 
+     * @param mimeType
+     *            MIME type
+     * @return configured Parser instance, or <code>null</code>
+     */
+    public Parser getParser(String mimeType) {
+	return parsers.get(mimeType);
+    }
+
+    /**
+     * Returns the registered parsers keyed by MIME type string.  This is
+     * the map held by this instance itself, not a copy.
+     *
+     * @return map from MIME type to parser
+     */
+    public Map<String, Parser> getParsers() {
+        return parsers;
+    }
+
+    /**
+     * Returns the MIME type repository created when this configuration
+     * was built.
+     *
+     * @return MIME type repository
+     */
+    public MimeTypes getMimeRepository() {
+        return mimeTypes;
+    }
+
+    /**
+     * Provides a default configuration (TikaConfig). Currently creates a new
+     * instance each time it's called; we may be able to have it return a shared
+     * instance once it is completely immutable.
+     *
+     * <p>Checked failures raised while building the configuration are
+     * rethrown as {@link RuntimeException} with the original exception
+     * as cause.</p>
+     * 
+     * @return default configuration
+     */
+    public static TikaConfig getDefaultConfig() {
+        try {
+            return new TikaConfig();
+        } catch (TikaException tikaFailure) {
+            throw new RuntimeException(
+                    "Unable to access default configuration", tikaFailure);
+        } catch (IOException ioFailure) {
+            throw new RuntimeException(
+                    "Unable to read default configuration", ioFailure);
+        }
+    }
+
+    /**
+     * Provides a default configuration; equivalent to
+     * {@link #getDefaultConfig()}.  The <code>delegate</code> argument is
+     * ignored.
+     *
+     * @param delegate unused
+     * @return default configuration
+     * @deprecated This method will be removed in Apache Tika 1.0
+     * @see <a
+     *      href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
+     */
+    @Deprecated
+    public static TikaConfig getDefaultConfig(Parser delegate)
+            throws TikaException {
+        return getDefaultConfig();
+    }
+
+    /**
+     * Creates a DOM document builder from a newly obtained
+     * {@link DocumentBuilderFactory} with default settings.
+     *
+     * @throws TikaException if no builder can be created
+     */
+    private static DocumentBuilder getBuilder() throws TikaException {
+        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+        try {
+            return factory.newDocumentBuilder();
+        } catch (ParserConfigurationException cause) {
+            throw new TikaException("XML parser not available", cause);
+        }
+    }
+
+    /**
+     * Returns the first direct child element of <code>element</code>
+     * whose node name equals <code>name</code>, or <code>null</code> if
+     * there is no such child.
+     */
+    private static Element getChild(Element element, String name) {
+        for (Node candidate = element.getFirstChild();
+                candidate != null;
+                candidate = candidate.getNextSibling()) {
+            boolean isElement = candidate.getNodeType() == Node.ELEMENT_NODE;
+            if (isElement && name.equals(candidate.getNodeName())) {
+                return (Element) candidate;
+            }
+        }
+        return null;
+    }
+
+}
\ No newline at end of file

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Wed Jun 30 10:36:20 2010
@@ -0,0 +1,233 @@
+package org.apache.nutch.parse.tika;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+
+import org.apache.avro.util.Utf8;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.html.dom.HTMLDocumentImpl;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilters;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseStatusCodes;
+import org.apache.nutch.parse.ParseStatusUtils;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.storage.ParseStatus;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.storage.WebPage.Field;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TableUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.w3c.dom.DocumentFragment;
+
+/**
+ * Wrapper for Tika parsers. Mimics the HTMLParser but uses the XHTML
+ * representation returned by Tika as SAX events: the events are collected
+ * into a DOM fragment from which meta directives, text, title and outlinks
+ * are extracted.
+ */
+public class TikaParser implements org.apache.nutch.parse.Parser {
+
+  public static final Log LOG = LogFactory.getLog(TikaParser.class);
+
+  /** Fields of {@link WebPage} reported by {@link #getFields()}. */
+  private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+  static {
+    FIELDS.add(WebPage.Field.BASE_URL);
+    FIELDS.add(WebPage.Field.CONTENT_TYPE);
+  }
+
+  private Configuration conf;
+  private TikaConfig tikaConfig = null;
+  private DOMContentUtils utils;
+  private HtmlParseFilters htmlParseFilters;
+  private String cachingPolicy;
+
+  /**
+   * Parses the content of {@code page} with the Tika parser registered for
+   * the page's content type.
+   *
+   * @param url URL of the page; used for logging and passed to the HTML
+   *          parse filters
+   * @param page page whose base URL, content type and content are read, and
+   *          whose metadata is populated with the Tika metadata
+   * @return the resulting parse, or an empty parse when the base URL is
+   *         malformed, no Tika parser is available for the content type, or
+   *         the Tika parser throws
+   */
+  @Override
+  public Parse getParse(String url, WebPage page) {
+
+    String baseUrl = TableUtil.toString(page.getBaseUrl());
+    URL base;
+    try {
+      base = new URL(baseUrl);
+    } catch (MalformedURLException e) {
+      return ParseStatusUtils.getEmptyParse(e, getConf());
+    }
+
+    // get the right parser using the mime type as a clue
+    String mimeType = page.getContentType().toString();
+    Parser parser = tikaConfig.getParser(mimeType);
+
+    if (parser == null) {
+      String message = "Can't retrieve Tika parser for mime-type " + mimeType;
+      LOG.error(message);
+      return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED_EXCEPTION,
+          message, getConf());
+    }
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type "
+          + mimeType);
+    }
+
+    // Only the bytes between the buffer's position and limit are content;
+    // the backing array may start earlier or be longer than that window.
+    ByteBuffer content = page.getContent();
+    ByteArrayInputStream rawStream = new ByteArrayInputStream(content.array(),
+        content.arrayOffset() + content.position(), content.remaining());
+
+    Metadata tikamd = new Metadata();
+
+    HTMLDocumentImpl doc = new HTMLDocumentImpl();
+    doc.setErrorChecking(false);
+    DocumentFragment root = doc.createDocumentFragment();
+    DOMBuilder domhandler = new DOMBuilder(doc, root);
+    ParseContext context = new ParseContext();
+    try {
+      parser.parse(rawStream, domhandler, tikamd, context);
+    } catch (Exception e) {
+      LOG.error("Error parsing "+url,e);
+      return ParseStatusUtils.getEmptyParse(e, getConf());
+    }
+
+    HTMLMetaTags metaTags = new HTMLMetaTags();
+    String text = "";
+    String title = "";
+    Outlink[] outlinks = new Outlink[0];
+
+    // we have converted the sax events generated by Tika into a DOM object
+    // so we can now use the usual HTML resources from Nutch
+    // get meta directives
+    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
+    }
+
+    // check meta directives
+    if (!metaTags.getNoIndex()) { // okay to index
+      StringBuffer sb = new StringBuffer();
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting text...");
+      }
+      utils.getText(sb, root); // extract text
+      text = sb.toString();
+      sb.setLength(0);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting title...");
+      }
+      utils.getTitle(sb, root); // extract title
+      title = sb.toString().trim();
+    }
+
+    if (!metaTags.getNoFollow()) { // okay to follow links
+      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
+      URL baseTag = utils.getBase(root);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting links...");
+      }
+      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
+      outlinks = l.toArray(new Outlink[l.size()]);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("found " + outlinks.length + " outlinks in " + base);
+      }
+    }
+
+    // populate Nutch metadata with Tika metadata (the title is skipped)
+    String[] TikaMDNames = tikamd.names();
+    for (String tikaMDName : TikaMDNames) {
+      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) continue;
+      // TODO what if multivalued?
+      page.putToMetadata(new Utf8(tikaMDName), ByteBuffer.wrap(Bytes.toBytes(tikamd
+          .get(tikaMDName))));
+    }
+
+    // no outlinks? try OutlinkExtractor e.g works for mime types where no
+    // explicit markup for anchors
+
+    if (outlinks.length == 0) {
+      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
+    }
+
+    // Build a status object of our own: ParseStatusUtils.STATUS_SUCCESS is a
+    // shared static instance, and setting redirect details on it would leak
+    // them into every other parse that uses the constant.
+    ParseStatus status = new ParseStatus();
+    status.setMajorCode(ParseStatusCodes.SUCCESS);
+    if (metaTags.getRefresh()) {
+      status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
+      status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
+      status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
+    }
+
+    Parse parse = new Parse(text, title, outlinks, status);
+    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);
+
+    if (metaTags.getNoCache()) { // not okay to cache
+      page.putToMetadata(new Utf8(Nutch.CACHING_FORBIDDEN_KEY), ByteBuffer.wrap(Bytes
+          .toBytes(cachingPolicy)));
+    }
+
+    return parse;
+  }
+
+  /**
+   * Stores the configuration and rebuilds everything derived from it: the
+   * Tika configuration, the HTML parse filters, the DOM utilities and the
+   * caching policy.
+   *
+   * @param conf configuration to use
+   * @throws RuntimeException if the default Tika configuration cannot be
+   *           loaded
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.tikaConfig = null;
+
+    try {
+      tikaConfig = TikaConfig.getDefaultConfig();
+    } catch (Exception e2) {
+      String message = "Problem loading default Tika configuration";
+      LOG.error(message, e2);
+      // keep both the context message and the original cause
+      throw new RuntimeException(message, e2);
+    }
+
+    this.htmlParseFilters = new HtmlParseFilters(getConf());
+    this.utils = new DOMContentUtils(conf);
+    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+        Nutch.CACHING_FORBIDDEN_CONTENT);
+  }
+
+  /** @return the configuration last passed to {@link #setConf(Configuration)} */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /** @return the page fields registered in the static initializer */
+  @Override
+  public Collection<Field> getFields() {
+    return FIELDS;
+  }
+
+  /**
+   * Entry point used for debugging: reads the file named by the first
+   * argument, parses it through {@link ParseUtil} and prints the content
+   * type, title, text and outlinks.
+   *
+   * @param args {@code args[0]} is the path of the file to parse
+   */
+  public static void main(String[] args) throws Exception {
+    String name = args[0];
+    String url = "file:" + name;
+    File file = new File(name);
+    byte[] bytes = new byte[(int) file.length()];
+    DataInputStream in = new DataInputStream(new FileInputStream(file));
+    try {
+      in.readFully(bytes);
+    } finally {
+      // release the file handle even when reading fails
+      in.close();
+    }
+    Configuration conf = NutchConfiguration.create();
+    WebPage page = new WebPage();
+    page.setBaseUrl(new Utf8(url));
+    page.setContent(ByteBuffer.wrap(bytes));
+    MimeUtil mimeutil = new MimeUtil(conf);
+    MimeType mtype = mimeutil.getMimeType(file);
+    page.setContentType(new Utf8(mtype.getName()));
+
+    // parse through ParseUtil rather than instantiating TikaParser directly
+    Parse parse = new ParseUtil(conf).parse(url, page);
+
+    System.out.println("content type: " + mtype.getName());
+    System.out.println("title: " + parse.getTitle());
+    System.out.println("text: " + parse.getText());
+    System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks()));
+  }
+}

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java Wed Jun 30 10:36:20 2010
@@ -0,0 +1,113 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer,
+ * XXX in order to avoid dependency on Xalan.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id: XMLCharacterRecognizer.java 823614 2009-10-09 17:02:32Z ab $
+ */
+package org.apache.nutch.parse.tika;
+
+/**
+ * Class used to verify whether the specified <var>ch</var> 
+ * conforms to the XML 1.0 definition of whitespace. 
+ */
+class XMLCharacterRecognizer
+{
+
+  /**
+   * Returns whether the specified <var>ch</var> conforms to the XML 1.0 definition
+   * of whitespace.  Refer to <A href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S">
+   * the definition of <CODE>S</CODE></A> for details.
+   * @param ch Character to check as XML whitespace.
+   * @return =true if <var>ch</var> is XML whitespace; otherwise =false.
+   */
+  static boolean isWhiteSpace(char ch)
+  {
+    return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA);
+  }
+
+  /**
+   * Tell if the string is whitespace.
+   *
+   * @param ch Character array to check as XML whitespace.
+   * @param start Start index of characters in the array
+   * @param length Number of characters in the array 
+   * @return True if the characters in the array are 
+   * XML whitespace; otherwise, false.
+   */
+  static boolean isWhiteSpace(char ch[], int start, int length)
+  {
+
+    int end = start + length;
+
+    for (int s = start; s < end; s++)
+    {
+      if (!isWhiteSpace(ch[s]))
+        return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Tell if the string is whitespace.
+   *
+   * @param buf StringBuffer to check as XML whitespace.
+   * @return True if characters in buffer are XML whitespace, false otherwise
+   */
+  static boolean isWhiteSpace(StringBuffer buf)
+  {
+
+    int n = buf.length();
+
+    for (int i = 0; i < n; i++)
+    {
+      if (!isWhiteSpace(buf.charAt(i)))
+        return false;
+    }
+
+    return true;
+  }
+  
+  /**
+   * Tell if the string is whitespace.
+   *
+   * @param s String to check as XML whitespace.
+   * @return True if characters in buffer are XML whitespace, false otherwise
+   */
+  static boolean isWhiteSpace(String s)
+  {
+
+    if(null != s)
+    {
+      int n = s.length();
+  
+      for (int i = 0; i < n; i++)
+      {
+        if (!isWhiteSpace(s.charAt(i)))
+          return false;
+      }
+    }
+
+    return true;
+  }
+
+}

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java Wed Jun 30 10:36:20 2010
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.mime.MimeType;
+
+/**
+ * Unit tests for parsing MS Word documents: sample files are read from the
+ * "test.data" directory and parsed through {@link ParseUtil}.
+ * 
+ * @author John Xing
+ */
+public class TestMSWordParser extends TestCase {
+
+    private String fileSeparator = System.getProperty("file.separator");
+    // Directory holding the sample files; taken from the "test.data" system
+    // property and defaulting to the current directory.
+    // NOTE(review): earlier comments pointed at ./src/plugin/build-plugin.xml
+    // and ./src/plugin/parse-msword/ for how the samples get there - confirm
+    // the equivalent location for this plugin.
+    private String sampleDir = System.getProperty("test.data", ".");
+    private String[] sampleFiles = { "word97.doc" };
+
+    private String expectedText = "This is a sample doc file prepared for nutch.";
+
+    private Configuration conf;
+
+    public TestMSWordParser(String name) {
+        super(name);
+    }
+
+    protected void setUp() {
+        conf = NutchConfiguration.create();
+        conf.set("file.content.limit", "-1");
+    }
+
+    protected void tearDown() {
+    }
+
+    /**
+     * Reads the named sample file, wraps it in a {@link WebPage} and returns
+     * the text extracted by the parser.
+     *
+     * @param fileName name of a file inside the sample directory
+     * @return text of the resulting parse
+     */
+    public String getTextContent(String fileName) throws ProtocolException,
+            ParseException, IOException {
+        String urlString = sampleDir + fileSeparator + fileName;
+
+        File file = new File(urlString);
+        byte[] bytes = new byte[(int) file.length()];
+        DataInputStream in = new DataInputStream(new FileInputStream(file));
+        try {
+            in.readFully(bytes);
+        } finally {
+            // release the file handle even when reading fails
+            in.close();
+        }
+        WebPage page = new WebPage();
+        page.setBaseUrl(new Utf8("file:"+urlString));
+        page.setContent(ByteBuffer.wrap(bytes));
+        // the content type is detected from the file itself
+        MimeUtil mimeutil = new MimeUtil(conf);
+        MimeType mtype = mimeutil.getMimeType(file);
+        page.setContentType(new Utf8(mtype.getName()));
+
+        Parse parse = new ParseUtil(conf).parse("file:"+urlString, page);
+        return parse.getText();
+    }
+
+    /** Each known sample must start with the expected sentence. */
+    public void testIt() throws ProtocolException, ParseException, IOException {
+        for (int i = 0; i < sampleFiles.length; i++) {
+            String found = getTextContent(sampleFiles[i]);
+            assertTrue("text found : '" + found + "'", found
+                    .startsWith(expectedText));
+        }
+    }
+
+    /** Every .doc file in the sample directory must yield some text. */
+    public void testOpeningDocs() throws ProtocolException, ParseException, IOException {
+        String[] filenames = new File(sampleDir).list();
+        // File.list() returns null when the path is not a readable directory
+        assertNotNull("cannot list sample directory " + sampleDir, filenames);
+        for (int i = 0; i < filenames.length; i++) {
+            if (!filenames[i].endsWith(".doc"))
+                continue;
+            assertTrue("cann't read content of " + filenames[i],
+                    getTextContent(filenames[i]).length() > 0);
+        }
+    }
+}

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java Wed Jun 30 10:36:20 2010
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.ByteBuffer;
+
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.mime.MimeType;
+
+/**
+ * Unit tests for parsing OpenOffice documents: sample files are read from
+ * the "test.data" directory and parsed through {@link ParseUtil}.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class TestOOParser extends TestCase {
+
+    private String fileSeparator = System.getProperty("file.separator");
+    // Directory holding the sample files; taken from the "test.data" system
+    // property and defaulting to the current directory.
+    // NOTE(review): earlier comments pointed at ./src/plugin/build-plugin.xml
+    // and ./src/plugin/parse-oo/build.xml for how the samples get there -
+    // confirm the equivalent location for this plugin.
+    private String sampleDir = System.getProperty("test.data", ".");
+    private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
+
+    private String sampleText = "ootest.txt";
+
+    // Whitespace-normalized content of sampleText; stays null when the file
+    // could not be read. It is only printed, never asserted against.
+    private String expectedText;
+
+    public TestOOParser(String name) {
+        super(name);
+        try {
+            // read the test string
+            InputStreamReader isr = new InputStreamReader(new FileInputStream(
+                    sampleDir + fileSeparator + sampleText), "UTF-8");
+            StringBuffer sb = new StringBuffer();
+            try {
+                char[] buf = new char[1024];
+                int len;
+                while ((len = isr.read(buf)) > 0) {
+                    sb.append(buf, 0, len);
+                }
+            } finally {
+                // release the file handle even when reading fails
+                isr.close();
+            }
+            // normalize space
+            expectedText = sb.toString().replaceAll("[ \t\r\n]+", " ");
+        } catch (Exception e) {
+            // best effort: the reference text is informational only, so a
+            // missing sample must not prevent the test from being created
+            e.printStackTrace();
+        }
+    }
+
+    protected void setUp() {
+    }
+
+    protected void tearDown() {
+    }
+
+    /** Every sample document must yield a non-empty text. */
+    public void testIt() throws ProtocolException, ParseException, IOException {
+        Configuration conf = NutchConfiguration.create();
+        MimeUtil mimeutil = new MimeUtil(conf);
+
+        System.out.println("Expected : " + expectedText);
+
+        for (int i = 0; i < sampleFiles.length; i++) {
+            if (!sampleFiles[i].startsWith("ootest"))
+                continue;
+
+            String urlString = "file:" + sampleDir + fileSeparator
+                    + sampleFiles[i];
+
+            File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
+            byte[] bytes = new byte[(int) file.length()];
+            DataInputStream in = new DataInputStream(new FileInputStream(file));
+            try {
+                in.readFully(bytes);
+            } finally {
+                // release the file handle even when reading fails
+                in.close();
+            }
+
+            WebPage page = new WebPage();
+            page.setBaseUrl(new Utf8(urlString));
+            page.setContent(ByteBuffer.wrap(bytes));
+            MimeType mtype = mimeutil.getMimeType(file);
+            page.setContentType(new Utf8(mtype.getName()));
+
+            Parse parse = new ParseUtil(conf).parse(urlString, page);
+
+            String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
+
+            // simply test for the presence of a text - the ordering of the
+            // elements may differ from what was expected in the previous
+            // tests
+            assertTrue("no text extracted from " + sampleFiles[i],
+                    text.length() > 0);
+
+            System.out.println("Found " + sampleFiles[i] + ": " + text);
+        }
+    }
+
+}

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java?rev=959259&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java Wed Jun 30 10:36:20 2010
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.mime.MimeType;
+
+/**
+ * Unit tests for PdfParser.
+ * 
+ * @author John Xing
+ */
+public class TestPdfParser extends TestCase {
+
+    private String fileSeparator = System.getProperty("file.separator");
+    // This system property is defined in ./src/plugin/build-plugin.xml
+    private String sampleDir = System.getProperty("test.data", ".");
+    // Make sure sample files are copied to "test.data" as specified in
+    // ./src/plugin/parse-pdf/build.xml during plugin compilation.
+    // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
+    private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
+
+    private String expectedText = "A VERY SMALL PDF FILE";
+
+    public TestPdfParser(String name) {
+	super(name);
+    }
+
+    protected void setUp() {
+    }
+
+    protected void tearDown() {
+    }
+
+    public void testIt() throws ProtocolException, ParseException, IOException {
+	String urlString;
+	Parse parse;
+	Configuration conf = NutchConfiguration.create();
+	MimeUtil mimeutil = new MimeUtil(conf);
+
+	for (int i = 0; i < sampleFiles.length; i++) {
+	    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+	    File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
+	    byte[] bytes = new byte[(int) file.length()];
+	    DataInputStream in = new DataInputStream(new FileInputStream(file));
+	    in.readFully(bytes);
+	    in.close();
+
+	    WebPage page = new WebPage();
+	    page.setBaseUrl(new Utf8(urlString));
+	    page.setContent(ByteBuffer.wrap(bytes));
+	    MimeType mtype = mimeutil.getMimeType(file);
+	    page.setContentType(new Utf8(mtype.getName()));
+
+	    parse = new ParseUtil(conf).parse(urlString, page);
+
+	    int index = parse.getText().indexOf(expectedText);
+	    assertTrue(index > 0);
+	}
+    }
+
+}