You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/05/05 08:22:29 UTC

svn commit: r1677746 - in /tika/site: publish/1.9/ publish/1.9/examples.html publish/1.9/formats.html publish/plugin-management.html src/site/apt/1.9/formats.apt

Author: nick
Date: Tue May  5 06:22:29 2015
New Revision: 1677746

URL: http://svn.apache.org/r1677746
Log:
In-progress formats and examples pages for 1.9

Added:
    tika/site/publish/1.9/
    tika/site/publish/1.9/examples.html
    tika/site/publish/1.9/formats.html
Modified:
    tika/site/publish/plugin-management.html
    tika/site/src/site/apt/1.9/formats.apt

Added: tika/site/publish/1.9/examples.html
URL: http://svn.apache.org/viewvc/tika/site/publish/1.9/examples.html?rev=1677746&view=auto
==============================================================================
--- tika/site/publish/1.9/examples.html (added)
+++ tika/site/publish/1.9/examples.html Tue May  5 06:22:29 2015
@@ -0,0 +1,382 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+ 
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+
+
+
+
+
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <title>Apache Tika - Tika API Usage Examples</title>
+    <style type="text/css" media="all">
+      @import url("../css/site.css");
+    </style>
+    <link rel="icon" type="image/png" href="../tikaNoText16.png" />
+    <script type="text/javascript">
+      function selectProvider(form) {
+        provider = form.elements['searchProvider'].value;
+        if (provider == "any") {
+          if (Math.random() > 0.5) {
+            provider = "lucid";
+          } else {
+            provider = "sl";
+          }
+        }
+        if (provider == "lucid") {
+          form.action = "http://find.searchhub.org/p:tika";
+        } else if (provider == "sl") {
+          form.action = "http://search-lucene.com/tika";
+        }
+        days = 90;
+        date = new Date();
+        date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
+        expires = "; expires=" + date.toGMTString();
+        document.cookie = "searchProvider=" + provider + expires + "; path=/";
+      }
+      function initProvider() {
+        if (document.cookie.length>0) {
+          cStart=document.cookie.indexOf("searchProvider=");
+          if (cStart!=-1) {
+            cStart=cStart + "searchProvider=".length;
+            cEnd=document.cookie.indexOf(";", cStart);
+            if (cEnd==-1) {
+              cEnd=document.cookie.length;
+            }
+            provider = unescape(document.cookie.substring(cStart,cEnd));
+            document.forms['searchform'].elements['searchProvider'].value = provider;
+          }
+        }
+        document.forms['searchform'].elements['q'].focus();
+      }
+    </script>
+  </head>
+  <body onLoad="initProvider();">
+    <div id="body">
+      <div id="banner">
+        <a href="http://tika.apache.org" id="bannerLeft" title="Apache Tika"
+          ><img src="http://tika.apache.org/tika.png" alt="Apache Tika"
+                width="292" height="100"/></a>
+        <a href="http://www.apache.org/" id="bannerRight"
+           title="The Apache Software Foundation"
+          ><img src="http://tika.apache.org/asf-logo.gif" alt="The Apache Software Foundation"
+                width="387" height="100"/></a>
+      </div>
+      <div id="content">
+        <!-- Licensed to the Apache Software Foundation (ASF) under one or more --><!-- contributor license agreements.  See the NOTICE file distributed with --><!-- this work for additional information regarding copyright ownership. --><!-- The ASF licenses this file to You under the Apache License, Version 2.0 --><!-- (the "License"); you may not use this file except in compliance with --><!-- the License.  You may obtain a copy of the License at --><!--  --><!-- http://www.apache.org/licenses/LICENSE-2.0 --><!--  --><!-- Unless required by applicable law or agreed to in writing, software --><!-- distributed under the License is distributed on an "AS IS" BASIS, --><!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --><!-- See the License for the specific language governing permissions and --><!-- limitations under the License. --><div class="section">
+<h2>Apache Tika API Usage Examples<a name="Apache_Tika_API_Usage_Examples"></a></h2>
+<p>This page provides a number of examples on how to use the various Tika APIs. All of the examples shown are also available in the <a class="externalLink" href="https://svn.apache.org/repos/asf/tika/trunk/tika-example">Tika Example module</a> in SVN.</p>
+<ul>
+<li><a href="#Apache_Tika_API_Usage_Examples">Apache Tika API Usage Examples</a>
+<ul>
+<li><a href="#Parsing">Parsing</a>
+<ul>
+<li><a href="#Parsing_using_the_Tika_Facade">Parsing using the Tika Facade</a></li>
+<li><a href="#Parsing_using_the_Auto-Detect_Parser">Parsing using the Auto-Detect Parser</a></li></ul></li>
+<li><a href="#Picking_different_output_formats">Picking different output formats</a>
+<ul>
+<li><a href="#Parsing_to_Plain_Text">Parsing to Plain Text</a></li>
+<li><a href="#Parsing_to_XHTML">Parsing to XHTML</a></li>
+<li><a href="#Fetching_just_certain_bits_of_the_XHTML">Fetching just certain bits of the XHTML</a></li></ul></li>
+<li><a href="#Custom_Content_Handlers">Custom Content Handlers</a>
+<ul>
+<li><a href="#Extract_Phone_Numbers_from_Content_into_the_Metadata">Extract Phone Numbers from Content into the Metadata</a></li>
+<li><a href="#Streaming_the_plain_text_in_chunks">Streaming the plain text in chunks</a></li></ul></li>
+<li><a href="#Translation">Translation</a>
+<ul>
+<li><a href="#Translation_using_the_Microsoft_Translation_API">Translation using the Microsoft Translation API</a></li></ul></li>
+<li><a href="#Language_Identification">Language Identification</a></li>
+<li><a href="#Additional_Examples">Additional Examples</a></li></ul></li></ul>
+<div class="section">
+<h3><a name="Parsing">Parsing</a></h3>
+<p>Tika provides a number of different ways to parse a file. These provide different levels of control, flexibility, and complexity.</p>
+<div class="section">
+<h4><a name="Parsing_using_the_Tika_Facade">Parsing using the Tika Facade</a></h4>
+<p>The <a href="./apidocs/org/apache/tika/Tika.html">Tika facade</a> provides a number of very quick and easy ways to have your content parsed by Tika, and return the resulting plain text.</p><style type="text/css">
+   @import url('attached-includes/css/shCoreDefault.css');
+</style>
+<div id="highlighter_370145" class="syntaxhighlighter nogutter  java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number49 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String parseToStringExample() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number50 index1 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">InputStream stream = ParsingExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><code class="java plain">);</code></div><div class="line number51 index2 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">Tika tika = </code><code class="java keyword">new</code> <code class="java plain">Tika();</code></div><
 div class="line number52 index3 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">try</code> <code class="java plain">{</code></div><div class="line number53 index4 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">return</code> <code class="java plain">tika.parseToString(stream);</code></div><div class="line number54 index5 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">} </code><code class="java keyword">finally</code> <code class="java plain">{</code></div><div class="line number55 index6 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">stream.close();</code></div><div class="line number56 index7 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">}</code></div><div class="line number57 index8 alt2"><code class="java plain">}</code></div></
 div></td></tr></tbody></table></div></div>
+<div class="section">
+<h4><a name="Parsing_using_the_Auto-Detect_Parser">Parsing using the Auto-Detect Parser</a></h4>
+<p>For more control, you can call the <a href="./apidocs/org/apache/tika/parser/Parser.html">Tika Parsers</a> directly. Most likely, you'll want to start out using the <a href="./apidocs/org/apache/tika/parser/AutoDetectParser.html">Auto-Detect Parser</a>, which automatically figures out what kind of content you have, then calls the appropriate parser for you.</p><div id="highlighter_820299" class="syntaxhighlighter nogutter  java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number83 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String parseExample() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number84 index1 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">InputStream stream = ParsingExample.</code><code class="java keyword">class</code><c
 ode class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><code class="java plain">);</code></div><div class="line number85 index2 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number86 index3 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">BodyContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">BodyContentHandler();</code></div><div class="line number87 index4 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number88 index5 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">tr
 y</code> <code class="java plain">{</code></div><div class="line number89 index6 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number90 index7 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number91 index8 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">} </code><code class="java keyword">finally</code> <code class="java plain">{</code></div><div class="line number92 index9 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">stream.close();</code></div><div class="line number93 index10 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">}</code></div><div class="l
 ine number94 index11 alt1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div></div>
+<div class="section">
+<h3><a name="Picking_different_output_formats">Picking different output formats</a></h3>
+<p>With Tika, you can get the textual content of your files returned in a number of different formats. These can be plain text, html, xhtml, xhtml of one part of the file etc. This is controlled based on the <a class="externalLink" href="http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html">ContentHandler</a> you supply to the Parser.</p>
+<div class="section">
+<h4><a name="Parsing_to_Plain_Text">Parsing to Plain Text</a></h4>
+<p>By using the <a href="./apidocs/org/apache/tika/sax/BodyContentHandler.html">BodyContentHandler</a>, you can request that Tika return only the content of the document's body as a plain-text string.</p><div id="highlighter_927957" class="syntaxhighlighter nogutter  java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number46 index0 alt1"><code class="java keyword">public</code> <code class="java plain">String parseToPlainText() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number47 index1 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">BodyContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">BodyContentHandler();</code></div><div class="line number48 index2 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code>&nbsp;</div><
 div class="line number49 index3 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><code class="java plain">);</code></div><div class="line number50 index4 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number51 index5 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number52 index6 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">try</code> <code class="java plain">{</code><
 /div><div class="line number53 index7 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number54 index8 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number55 index9 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">} </code><code class="java keyword">finally</code> <code class="java plain">{</code></div><div class="line number56 index10 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">stream.close();</code></div><div class="line number57 index11 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">}</code></div><div class="line number58 index12 alt1"><code class="ja
 va plain">}</code></div></div></td></tr></tbody></table></div></div>
+<div class="section">
+<h4><a name="Parsing_to_XHTML">Parsing to XHTML</a></h4>
+<p>By using the <a href="./apidocs/org/apache/tika/sax/ToXMLContentHandler.html">ToXMLContentHandler</a>, you can get the XHTML content of the whole document as a string.</p><div id="highlighter_545226" class="syntaxhighlighter nogutter  java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number63 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String parseToHTML() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number64 index1 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">ContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">ToXMLContentHandler();</code></div><div class="line number65 index2 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code>&nbsp;</div><div class="line number66 index3 alt1">
 <code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><code class="java plain">);</code></div><div class="line number67 index4 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number68 index5 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number69 index6 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">try</code> <code class="java plain">{</code></div><div class="line number70 index7 
 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number71 index8 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number72 index9 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">} </code><code class="java keyword">finally</code> <code class="java plain">{</code></div><div class="line number73 index10 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">stream.close();</code></div><div class="line number74 index11 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">}</code></div><div class="line number75 index12 alt2"><code class="java plain">}</code></div></div></td></t
 r></tbody></table></div>
+<p>If you just want the body of the xhtml document, without the header, you can chain together a <a href="./apidocs/org/apache/tika/sax/BodyContentHandler.html">BodyContentHandler</a> and a <a href="./apidocs/org/apache/tika/sax/ToXMLContentHandler.html">ToXMLContentHandler</a> as shown:</p><div id="highlighter_23305" class="syntaxhighlighter nogutter  java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number81 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String parseBodyToHTML() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number82 index1 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">ContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">BodyContentHandler(</code></div><div class="line number83 ind
 ex2 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">new</code> <code class="java plain">ToXMLContentHandler());</code></div><div class="line number84 index3 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code>&nbsp;</div><div class="line number85 index4 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test.doc"</code><code class="java plain">);</code></div><div class="line number86 index5 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number87 index6 alt2"><code class="java spaces">&nbsp;&nbsp;&nb
 sp;&nbsp;</code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number88 index7 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">try</code> <code class="java plain">{</code></div><div class="line number89 index8 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number90 index9 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number91 index10 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">} </code><code class="java keyword">finally</code> <code class="java plain">{</code></div><div class="line number92 index11 alt1"><
 code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">stream.close();</code></div><div class="line number93 index12 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">}</code></div><div class="line number94 index13 alt1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div>
+<div class="section">
+<h4><a name="Fetching_just_certain_bits_of_the_XHTML">Fetching just certain bits of the XHTML</a></h4>
+<p>It is possible to execute XPath queries on the parse results, to fetch only certain bits of the XHTML. </p><div id="highlighter_818495" class="syntaxhighlighter nogutter  java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number100 index0 alt1"><code class="java keyword">public</code> <code class="java plain">String parseOnePartToHTML() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number101 index1 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java comments">// Only get things under html -> body -> div (class=header)</code></div><div class="line number102 index2 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">XPathParser xhtmlParser = </code><code class="java keyword">new</code> <code class="java plain">XPathParser(</code><code class="java strin
 g">"xhtml"</code><code class="java plain">, XHTMLContentHandler.XHTML);</code></div><div class="line number103 index3 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">Matcher divContentMatcher = xhtmlParser.parse(</code></div><div class="line number104 index4 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java string">"/xhtml:html/xhtml:body/xhtml:div/descendant::node()"</code><code class="java plain">);&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </code></div><div class="line number105 index5 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">ContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">MatchingContentHandler(</code></div><div class="line number106 index6 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java ke
 yword">new</code> <code class="java plain">ToXMLContentHandler(), divContentMatcher);</code></div><div class="line number107 index7 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code>&nbsp;</div><div class="line number108 index8 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test2.doc"</code><code class="java plain">);</code></div><div class="line number109 index9 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">AutoDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number110 index10 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <cod
 e class="java plain">Metadata();</code></div><div class="line number111 index11 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">try</code> <code class="java plain">{</code></div><div class="line number112 index12 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number113 index13 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">return</code> <code class="java plain">handler.toString();</code></div><div class="line number114 index14 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">} </code><code class="java keyword">finally</code> <code class="java plain">{</code></div><div class="line number115 index15 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plai
 n">stream.close();</code></div><div class="line number116 index16 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">}</code></div><div class="line number117 index17 alt2"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div></div>
+<div class="section">
+<h3><a name="Custom_Content_Handlers">Custom Content Handlers</a></h3>
+<p>The textual output of parsing a file with Tika is returned via the SAX <a class="externalLink" href="http://docs.oracle.com/javase/7/docs/api/org/xml/sax/ContentHandler.html">ContentHandler</a> you pass to the parse method. It is possible to customise your parsing by supplying your own ContentHandler which does special things.</p>
+<div class="section">
+<h4><a name="Extract_Phone_Numbers_from_Content_into_the_Metadata">Extract Phone Numbers from Content into the Metadata</a></h4>
+<p>By using the <a href="./apidocs/org/apache/tika/sax/PhoneExtractingContentHandler.html">PhoneExtractingContentHandler</a>, you can have any phone numbers found in the textual content of the document extracted and placed into the Metadata object for you.</p><div id="highlighter_210897" class="syntaxhighlighter nogutter  java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number69 index0 alt2"><code class="java keyword">public</code> <code class="java keyword">static</code> <code class="java keyword">void</code> <code class="java plain">process(File file) </code><code class="java keyword">throws</code> <code class="java plain">Exception {</code></div><div class="line number70 index1 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">Parser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number71 
 index2 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number72 index3 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java comments">// The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them</code></div><div class="line number73 index4 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java comments">// to the underlying Handler.</code></div><div class="line number74 index5 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">PhoneExtractingContentHandler handler = </code><code class="java keyword">new</code> <code class="java plain">PhoneExtractingContentHandler(</code><code class="java keyword">new</code> <code class="java plain">BodyContentHandler(), metadata);</code></div><div cl
 ass="line number75 index6 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">InputStream stream = </code><code class="java keyword">new</code> <code class="java plain">FileInputStream(file);</code></div><div class="line number76 index7 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">try</code> <code class="java plain">{</code></div><div class="line number77 index8 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">parser.parse(stream, handler, metadata, </code><code class="java keyword">new</code> <code class="java plain">ParseContext());</code></div><div class="line number78 index9 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">}</code></div><div class="line number79 index10 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">finally</code> <code class="java plain">{
 </code></div><div class="line number80 index11 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">stream.close();</code></div><div class="line number81 index12 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">}</code></div><div class="line number82 index13 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">String[] numbers = metadata.getValues(</code><code class="java string">"phonenumbers"</code><code class="java plain">);</code></div><div class="line number83 index14 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">for</code> <code class="java plain">(String number : numbers) {</code></div><div class="line number84 index15 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">phoneNumbers.add(number);</code></div><div class="line number85 index16 al
 t2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">}</code></div><div class="line number86 index17 alt1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div>
+<div class="section">
+<h4><a name="Streaming_the_plain_text_in_chunks">Streaming the plain text in chunks</a></h4>
+<p>Sometimes, you want to chunk the resulting text up, perhaps to output as you go minimising memory use, perhaps to output to HDFS files, or any other reason! With a small custom content handler, you can do that.</p><div id="highlighter_302486" class="syntaxhighlighter nogutter  java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number124 index0 alt1"><code class="java keyword">public</code> <code class="java plain">List&lt;String> parseToPlainTextChunks() </code><code class="java keyword">throws</code> <code class="java plain">IOException, SAXException, TikaException {</code></div><div class="line number125 index1 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">final</code> <code class="java plain">List&lt;String> chunks = </code><code class="java keyword">new</code> <code class="java plain">ArrayList&lt;String>();</code></div><div class="line number126 index2 alt1"><
 code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">chunks.add(</code><code class="java string">""</code><code class="java plain">);</code></div><div class="line number127 index3 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">ContentHandlerDecorator handler = </code><code class="java keyword">new</code> <code class="java plain">ContentHandlerDecorator() {</code></div><div class="line number128 index4 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java color1">@Override</code></div><div class="line number129 index5 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">public</code> <code class="java keyword">void</code> <code class="java plain">characters(</code><code class="java keyword">char</code><code class="java plain">[] ch, </code><code class="java keyword">int</code> <code class="java plain">star
 t, </code><code class="java keyword">int</code> <code class="java plain">length) {</code></div><div class="line number130 index6 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">String lastChunk = chunks.get(chunks.size()-</code><code class="java value">1</code><code class="java plain">);</code></div><div class="line number131 index7 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">String thisStr = </code><code class="java keyword">new</code> <code class="java plain">String(ch, start, length);</code></div><div class="line number132 index8 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code>&nbsp;</div><div class="line number133 index9 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code clas
 s="java keyword">if</code> <code class="java plain">(lastChunk.length()+length > MAXIMUM_TEXT_CHUNK_SIZE) {</code></div><div class="line number134 index10 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">chunks.add(thisStr);</code></div><div class="line number135 index11 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">} </code><code class="java keyword">else</code> <code class="java plain">{</code></div><div class="line number136 index12 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">chunks.set(chunks.size()-</code><code class="java value">1</code><code class="java plain">, lastChunk+thisStr);</code></div><div class="line number137 index13 alt2"><code class="java spaces">&nbsp
 ;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">}</code></div><div class="line number138 index14 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">}</code></div><div class="line number139 index15 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">};</code></div><div class="line number140 index16 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code>&nbsp;</div><div class="line number141 index17 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">InputStream stream = ContentHandlerExample.</code><code class="java keyword">class</code><code class="java plain">.getResourceAsStream(</code><code class="java string">"test2.doc"</code><code class="java plain">);</code></div><div class="line number142 index18 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">Aut
 oDetectParser parser = </code><code class="java keyword">new</code> <code class="java plain">AutoDetectParser();</code></div><div class="line number143 index19 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">Metadata metadata = </code><code class="java keyword">new</code> <code class="java plain">Metadata();</code></div><div class="line number144 index20 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">try</code> <code class="java plain">{</code></div><div class="line number145 index21 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">parser.parse(stream, handler, metadata);</code></div><div class="line number146 index22 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">return</code> <code class="java plain">chunks;</code></div><div class="line number147 index23 alt2"><code class
 ="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">} </code><code class="java keyword">finally</code> <code class="java plain">{</code></div><div class="line number148 index24 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">stream.close();</code></div><div class="line number149 index25 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">}</code></div><div class="line number150 index26 alt1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div></div>
+<div class="section">
+<h3><a name="Translation">Translation</a></h3>
+<p>Tika provides a pluggable Translation system, which allows you to send the results of parsing off to an external system or program to have the text translated into another language.</p>
+<div class="section">
+<h4><a name="Translation_using_the_Microsoft_Translation_API">Translation using the Microsoft Translation API</a></h4>
+<p>In order to use the Microsoft Translation API, you need to sign up for a Microsoft account, get an API key, then pass the key to Tika before translating.</p><div id="highlighter_669150" class="syntaxhighlighter nogutter  java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number23 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String microsoftTranslateToFrench(String text) {</code></div><div class="line number24 index1 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">MicrosoftTranslator translator = </code><code class="java keyword">new</code> <code class="java plain">MicrosoftTranslator();</code></div><div class="line number25 index2 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java comments">// Change the id and secret! See <a href="http://msdn.microsoft.com/en-us/library/hh454950.aspx">http://msdn.microso
 ft.com/en-us/library/hh454950.aspx.</a></code></div><div class="line number26 index3 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">translator.setId(</code><code class="java string">"dummy-id"</code><code class="java plain">);</code></div><div class="line number27 index4 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">translator.setSecret(</code><code class="java string">"dummy-secret"</code><code class="java plain">);</code></div><div class="line number28 index5 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">try</code> <code class="java plain">{</code></div><div class="line number29 index6 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">return</code> <code class="java plain">translator.translate(text, </code><code class="java string">"fr"</code><code class="java plain">);</code></div><div class=
 "line number30 index7 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">} </code><code class="java keyword">catch</code> <code class="java plain">(Exception e) {</code></div><div class="line number31 index8 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">return</code> <code class="java string">"Error while translating."</code><code class="java plain">;</code></div><div class="line number32 index9 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">}</code></div><div class="line number33 index10 alt2"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div></div>
+<div class="section">
+<h3><a name="Language_Identification">Language Identification</a></h3>
+<p>Tika provides support for identifying the language of text, through the <a href="./apidocs/org/apache/tika/language/LanguageIdentifier.html">LanguageIdentifier</a> class.</p><div id="highlighter_894888" class="syntaxhighlighter nogutter  java"><table border="0" cellpadding="0" cellspacing="0"><tbody><tr><td class="code"><div class="container"><div class="line number23 index0 alt2"><code class="java keyword">public</code> <code class="java plain">String identifyLanguage(String text) {</code></div><div class="line number24 index1 alt1"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java plain">LanguageIdentifier identifier = </code><code class="java keyword">new</code> <code class="java plain">LanguageIdentifier(text);</code></div><div class="line number25 index2 alt2"><code class="java spaces">&nbsp;&nbsp;&nbsp;&nbsp;</code><code class="java keyword">return</code> <code class="java plain">identifier.getLanguage();</code></div><div class="line number26 index3
  alt1"><code class="java plain">}</code></div></div></td></tr></tbody></table></div></div>
+<div class="section">
+<h3><a name="Additional_Examples">Additional Examples</a></h3>
+<p>A number of other examples are also available, including all of the examples from the <a class="externalLink" href="http://manning.com/mattmann/">Tika In Action book</a>. These can all be found in the <a class="externalLink" href="https://svn.apache.org/repos/asf/tika/trunk/tika-example">Tika Example module</a> in SVN.</p></div></div>
+      </div>
+      <div id="sidebar">
+        <div id="navigation">
+                    <h5>Apache Tika</h5>
+            <ul>
+              
+    <li class="none">
+                    <a href="../index.html">Introduction</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../download.html">Download</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../contribute.html">Contribute</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../mail-lists.html">Mailing Lists</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://wiki.apache.org/tika/" class="externalLink">Tika Wiki</a>
+          </li>
+              
+    <li class="none">
+                    <a href="https://issues.apache.org/jira/browse/TIKA" class="externalLink">Issue Tracker</a>
+          </li>
+          </ul>
+              <h5>Documentation</h5>
+            <ul>
+              
+          
+                    
+                  
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="expanded">
+                    <a href="../1.8/index.html">Apache Tika 1.8</a>
+                  <ul>
+                  
+    <li class="none">
+                    <a href="../1.8/gettingstarted.html">Getting Started</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.8/formats.html">Supported Formats</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.8/parser.html">Parser API</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.8/parser_guide.html">Parser 5min Quick Start Guide</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.8/detection.html">Content and Language Detection</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.8/examples.html">Usage Examples</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.8/api/">API Documentation</a>
+          </li>
+              </ul>
+        </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.7/index.html">Apache Tika 1.7</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.6/index.html">Apache Tika 1.6</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.5/index.html">Apache Tika 1.5</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.4/index.html">Apache Tika 1.4</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.3/index.html">Apache Tika 1.3</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.2/index.html">Apache Tika 1.2</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.1/index.html">Apache Tika 1.1</a>
+                </li>
+          </ul>
+              <h5>The Apache Software Foundation</h5>
+            <ul>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/" class="externalLink">About</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/licenses/" class="externalLink">License</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/security/" class="externalLink">Security</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/sponsorship.html" class="externalLink">Sponsorship</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/thanks.html" class="externalLink">Thanks</a>
+          </li>
+          </ul>
+      
+          <div id="search">
+            <h5>Search with Apache Solr</h5>
+            <form action="http://search.lucidimagination.com/p:tika"
+                  method="get" id="searchform">
+              <input type="text" id="query" name="q"/>
+              <select name="searchProvider" id="searchProvider">
+                <option value="any">provider</option>
+                <option value="lucid">Lucid Find</option>
+                <option value="sl">Search-Lucene</option>
+              </select>
+              <input type="submit" id="submit" value="Search" name="Search"
+                     onclick="selectProvider(this.form)"/>
+            </form>
+          </div>
+
+          <div id="bookpromo">
+            <h5>Books about Tika</h5>
+            <p>
+              <a href="http://manning.com/mattmann/" title="Tika in Action"
+                ><img src="../mattmann_cover150.jpg" alt="Tika in Action"
+                      width="150" height="186"/></a>
+            </p>
+          </div>
+        </div>
+      </div>
+      <div id="footer">
+        <p>
+          Copyright &#169; 2015
+          <a href="http://www.apache.org/">The Apache Software Foundation</a>.
+          Site powered by <a href="http://maven.apache.org/">Apache Maven</a>. 
+          Search powered by
+          <a href="http://www.lucidimagination.com">Lucid Imagination</a>
+          and <a href="http://sematext.com">Sematext</a>.
+          <br/>
+          Apache Tika, Tika, Apache, the Apache feather logo, and the Apache
+          Tika project logo are trademarks of The Apache Software Foundation.
+        </p>
+      </div>
+    </div>
+  </body>
+</html>

Added: tika/site/publish/1.9/formats.html
URL: http://svn.apache.org/viewvc/tika/site/publish/1.9/formats.html?rev=1677746&view=auto
==============================================================================
--- tika/site/publish/1.9/formats.html (added)
+++ tika/site/publish/1.9/formats.html Tue May  5 06:22:29 2015
@@ -0,0 +1,429 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+ 
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+
+
+
+
+
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <title>Apache Tika - Supported Document Formats</title>
+    <style type="text/css" media="all">
+      @import url("../css/site.css");
+    </style>
+    <link rel="icon" type="image/png" href="../tikaNoText16.png" />
+    <script type="text/javascript">
+      function selectProvider(form) {
+        provider = form.elements['searchProvider'].value;
+        if (provider == "any") {
+          if (Math.random() > 0.5) {
+            provider = "lucid";
+          } else {
+            provider = "sl";
+          }
+        }
+        if (provider == "lucid") {
+          form.action = "http://find.searchhub.org/p:tika";
+        } else if (provider == "sl") {
+          form.action = "http://search-lucene.com/tika";
+        }
+        days = 90;
+        date = new Date();
+        date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
+        expires = "; expires=" + date.toGMTString();
+        document.cookie = "searchProvider=" + provider + expires + "; path=/";
+      }
+      function initProvider() {
+        if (document.cookie.length>0) {
+          cStart=document.cookie.indexOf("searchProvider=");
+          if (cStart!=-1) {
+            cStart=cStart + "searchProvider=".length;
+            cEnd=document.cookie.indexOf(";", cStart);
+            if (cEnd==-1) {
+              cEnd=document.cookie.length;
+            }
+            provider = unescape(document.cookie.substring(cStart,cEnd));
+            document.forms['searchform'].elements['searchProvider'].value = provider;
+          }
+        }
+        document.forms['searchform'].elements['q'].focus();
+      }
+    </script>
+  </head>
+  <body onload="initProvider();">
+    <div id="body">
+      <div id="banner">
+        <a href="http://tika.apache.org" id="bannerLeft" title="Apache Tika"
+          ><img src="http://tika.apache.org/tika.png" alt="Apache Tika"
+                width="292" height="100"/></a>
+        <a href="http://www.apache.org/" id="bannerRight"
+           title="The Apache Software Foundation"
+          ><img src="http://tika.apache.org/asf-logo.gif" alt="The Apache Software Foundation"
+                width="387" height="100"/></a>
+      </div>
+      <div id="content">
+        <!-- Licensed to the Apache Software Foundation (ASF) under one or more --><!-- contributor license agreements.  See the NOTICE file distributed with --><!-- this work for additional information regarding copyright ownership. --><!-- The ASF licenses this file to You under the Apache License, Version 2.0 --><!-- (the "License"); you may not use this file except in compliance with --><!-- the License.  You may obtain a copy of the License at --><!--  --><!-- http://www.apache.org/licenses/LICENSE-2.0 --><!--  --><!-- Unless required by applicable law or agreed to in writing, software --><!-- distributed under the License is distributed on an "AS IS" BASIS, --><!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --><!-- See the License for the specific language governing permissions and --><!-- limitations under the License. --><div class="section">
+<h2>Supported Document Formats<a name="Supported_Document_Formats"></a></h2>
+<p>This page lists all the document formats supported by Apache Tika 1.9. Follow the links to the various parser class javadocs for more detailed information about each document format and how it is parsed by Tika.</p>
+<ul>
+<li><a href="#Supported_Document_Formats">Supported Document Formats</a>
+<ul>
+<li><a href="#HyperText_Markup_Language">HyperText Markup Language</a></li>
+<li><a href="#XML_and_derived_formats">XML and derived formats</a></li>
+<li><a href="#Microsoft_Office_document_formats">Microsoft Office document formats</a></li>
+<li><a href="#OpenDocument_Format">OpenDocument Format</a></li>
+<li><a href="#iWorks_document_formats">iWorks document formats</a></li>
+<li><a href="#Portable_Document_Format">Portable Document Format</a></li>
+<li><a href="#Electronic_Publication_Format">Electronic Publication Format</a></li>
+<li><a href="#Rich_Text_Format">Rich Text Format</a></li>
+<li><a href="#Compression_and_packaging_formats">Compression and packaging formats</a></li>
+<li><a href="#Text_formats">Text formats</a></li>
+<li><a href="#Feed_and_Syndication_formats">Feed and Syndication formats</a></li>
+<li><a href="#Help_formats">Help formats</a></li>
+<li><a href="#Audio_formats">Audio formats</a></li>
+<li><a href="#Image_formats">Image formats</a></li>
+<li><a href="#Video_formats">Video formats</a></li>
+<li><a href="#Java_class_files_and_archives">Java class files and archives</a></li>
+<li><a href="#Source_code">Source code</a></li>
+<li><a href="#Mail_formats">Mail formats</a></li>
+<li><a href="#CAD_formats">CAD formats</a></li>
+<li><a href="#Font_formats">Font formats</a></li>
+<li><a href="#Scientific_formats">Scientific formats</a></li>
+<li><a href="#Executable_programs_and_libraries">Executable programs and libraries</a></li>
+<li><a href="#Crypto_formats">Crypto formats</a></li></ul></li></ul>
+<div class="section">
+<h3><a name="HyperText_Markup_Language">HyperText Markup Language</a></h3>
+<p>The HyperText Markup Language (HTML) is the lingua franca of the web. Tika uses the <a class="externalLink" href="http://home.ccil.org/~cowan/XML/tagsoup/">TagSoup</a> library to support virtually any kind of HTML found on the web. The output from the <a href="./api/org/apache/tika/parser/html/HtmlParser.html">HtmlParser</a> class is guaranteed to be well-formed and valid XHTML, and various heuristics are used to prevent things like inline scripts from cluttering the extracted text content.</p></div>
+<div class="section">
+<h3><a name="XML_and_derived_formats">XML and derived formats</a></h3>
+<p>The Extensible Markup Language (XML) format is a generic format that can be used for all kinds of content. Tika has custom parsers for some widely used XML vocabularies like XHTML, OOXML and ODF, but the default <a href="./api/org/apache/tika/parser/xml/DcXMLParser.html">DcXMLParser</a> class simply extracts the text content of the document and ignores any XML structure. The only exceptions to this rule are Dublin Core metadata elements, which are used for the document metadata.</p></div>
+<div class="section">
+<h3><a name="Microsoft_Office_document_formats">Microsoft Office document formats</a></h3>
+<p>Microsoft Office and some related applications produce documents in the generic OLE 2 Compound Document and Office Open XML (OOXML) formats. The older OLE 2 format was introduced in Microsoft Office version 97 and was the default format until Office version 2007 and the new XML-based OOXML format. The <a href="./api/org/apache/tika/parser/microsoft/OfficeParser.html">OfficeParser</a> and <a href="./api/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.html">OOXMLParser</a> classes use <a class="externalLink" href="http://poi.apache.org/">Apache POI</a> libraries to support text and metadata extraction from both OLE2 and OOXML documents.</p>
+<p>Old, pre-OLE2 Excel files (Excel 2, 3 and 4) are handled by the <a href="./api/org/apache/tika/parser/microsoft/OldExcelParser.html">OldExcelParser</a>.</p></div>
+<div class="section">
+<h3><a name="OpenDocument_Format">OpenDocument Format</a></h3>
+<p>The OpenDocument format (ODF) is used most notably as the default format of the OpenOffice.org office suite. The <a href="./api/org/apache/tika/parser/odf/OpenDocumentParser.html">OpenDocumentParser</a> class supports this format and the earlier OpenOffice 1.0 format on which ODF is based.</p></div>
+<div class="section">
+<h3><a name="iWorks_document_formats">iWorks document formats</a></h3>
+<p>The various iWorks document formats (Numbers, Pages, Keynote) are supported by the <a href="./api/org/apache/tika/parser/iwork/IWorkPackageParser.html">IWorkPackageParser</a> class, which extracts text and metadata.</p></div>
+<div class="section">
+<h3><a name="Portable_Document_Format">Portable Document Format</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/pdf/PDFParser.html">PDFParser</a> class parses Portable Document Format (PDF) documents using the <a class="externalLink" href="http://pdfbox.apache.org/">Apache PDFBox</a> library.</p></div>
+<div class="section">
+<h3><a name="Electronic_Publication_Format">Electronic Publication Format</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/epub/EpubParser.html">EpubParser</a> class supports the Electronic Publication Format (EPUB) used for many digital books.</p>
+<p>The <a href="./api/org/apache/tika/parser/xml/FictionBookParser.html">FictionBookParser</a> class supports the xml-based Fiction Book publishing format.</p></div>
+<div class="section">
+<h3><a name="Rich_Text_Format">Rich Text Format</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/rtf/RTFParser.html">RTFParser</a> class uses the standard javax.swing.text.rtf feature to extract text content from Rich Text Format (RTF) documents.</p></div>
+<div class="section">
+<h3><a name="Compression_and_packaging_formats">Compression and packaging formats</a></h3>
+<p>Tika uses the <a class="externalLink" href="http://commons.apache.org/compress/">Commons Compress</a> library to support various compression and packaging formats. The <a href="./api/org/apache/tika/parser/pkg/CompressorParser.html">CompressorParser</a> class handles parsing of the top level compression formats, then the <a href="./api/org/apache/tika/parser/pkg/PackageParser.html">PackageParser</a> class and its subclasses parse the packaging formats and then pass the unpacked document streams to a second parsing stage using the parser instance specified in the parse context. Formats supported include Tar, RAR, AR, CPIO, Zip, 7Zip, Gzip, BZip2, XZ and Pack200.</p></div>
+<div class="section">
+<h3><a name="Text_formats">Text formats</a></h3>
+<p>Extracting text content from plain text files seems like a simple task until you start thinking of all the possible character encodings. The <a href="./api/org/apache/tika/parser/txt/TXTParser.html">TXTParser</a> class uses encoding detection code from the <a class="externalLink" href="http://site.icu-project.org/">ICU</a> project to automatically detect the character encoding of a text document.</p></div>
+<div class="section">
+<h3><a name="Feed_and_Syndication_formats">Feed and Syndication formats</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/feed/FeedParser.html">FeedParser</a> class supports the RSS and Atom feed syndication formats.</p>
+<p>The <a href="./api/org/apache/tika/parser/iptc/IptcAnpaParser.html">IptcAnpaParser</a> class supports the IPTC ANPA News Wire feed format.</p></div>
+<div class="section">
+<h3><a name="Help_formats">Help formats</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/chm/ChmParser.html">ChmParser</a> class supports the CHM Help format.</p></div>
+<div class="section">
+<h3><a name="Audio_formats">Audio formats</a></h3>
+<p>Tika can detect several common audio formats and extract metadata from them. Even text extraction is supported for some audio files that contain lyrics or other textual content. Extracted metadata includes sampling rates, channels, format information, artists, titles etc. The <a href="./api/org/apache/tika/parser/audio/AudioParser.html">AudioParser</a> and <a href="./api/org/apache/tika/parser/audio/MidiParser.html">MidiParser</a> classes use standard javax.sound features to process simple audio formats. The <a href="./api/org/apache/tika/parser/mp3/Mp3Parser.html">Mp3Parser</a> class adds support for the widely used MP3 format, and the <a href="./api/org/apache/tika/parser/mp4/MP4Parser.html">MP4Parser</a> class provides it for MP4 audio. The Ogg family of audio formats (Vorbis, Speex, Opus, Flac etc) are supported by the <a href="./api/org/gagravarr/tika/VorbisParser.html">VorbisParser</a>, <a href="./api/org/gagravarr/tika/OpusParser.html">OpusParser</a>, <a href="./api/org/ga
 gravarr/tika/SpeexParser.html">SpeexParser</a> and <a href="./api/org/gagravarr/tika/FlacParser.html">FlacParser</a> classes.</p></div>
+<div class="section">
+<h3><a name="Image_formats">Image formats</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/image/ImageParser.html">ImageParser</a> class uses the standard javax.imageio feature to extract simple metadata from image formats supported by the Java platform, such as PNG, GIF and BMP. More complex image metadata is available through the <a href="./api/org/apache/tika/parser/jpeg/JpegParser.html">JpegParser</a> and <a href="./api/org/apache/tika/parser/image/TiffParser.html">TiffParser</a> classes, which use the metadata-extractor library to support Exif metadata extraction from JPEG and TIFF images. The <a href="./api/org/apache/tika/parser/image/PSDParser.html">PSDParser</a> class extracts metadata from PSD images. The <a href="./api/org/apache/tika/parser/image/BPGParser.html">BPGParser</a> class extracts simple metadata from BPG (Better Portable Graphics) images. The <a href="./api/org/apache/tika/parser/image/WebPParser.html">WebPParser</a> class extracts simple metadata from the WebP image format.</p>
+<p>When extracting from images, it is also possible to chain in Tesseract via the <a href="./api/org/apache/tika/parser/ocr/TesseractOCRParser.html">TesseractOCRParser</a> to have OCR performed on the contents of the image.</p></div>
+<div class="section">
+<h3><a name="Video_formats">Video formats</a></h3>
+<p>Tika supports the Flash video format using a simple parsing algorithm implemented in the <a href="./api/org/apache/tika/parser/video/FLVParser.html">FLVParser</a> class.</p>
+<p>The MP4 family of video formats (MP4, Quicktime, 3GPP etc) is supported by the <a href="./api/org/apache/tika/parser/mp4/MP4Parser.html">MP4Parser</a> class, which extracts metadata on the video, along with the audio stream (if present).</p>
+<p>For the Ogg family of video formats, a limited amount of metadata is extracted by the <a href="./api/org/gagravarr/tika/OggParser.html">OggParser</a> class.</p></div>
+<div class="section">
+<h3><a name="Java_class_files_and_archives">Java class files and archives</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/asm/ClassParser.html">ClassParser</a> class extracts class names and method signatures from Java class files, and the <a href="./api/org/apache/tika/parser/pkg/ZipParser.html">ZipParser</a> class also supports jar archives.</p></div>
+<div class="section">
+<h3><a name="Source_code">Source code</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/code/SourceCodeParser.html">SourceCodeParser</a> class handles a number of source code formats, including Java, C, C++ and Groovy. It provides a formatted form of the code, along with some simple metadata.</p></div>
+<div class="section">
+<h3><a name="Mail_formats">Mail formats</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/mbox/MboxParser.html">MboxParser</a> can extract email messages from the mbox format used by many email archives and Unix-style mailboxes.</p>
+<p>The <a href="./api/org/apache/tika/parser/mail/RFC822Parser.html">RFC822Parser</a> can process single email messages in the RFC 822 format used by many email clients in their archives / exports.</p>
+<p>The <a href="./api/org/apache/tika/parser/mbox/OutlookPSTParser.html">OutlookPSTParser</a> can extract email messages from the Microsoft Outlook PST email format.</p>
+<p>The <a href="./api/org/apache/tika/parser/microsoft/TNEFParser.html">TNEFParser</a> can extract email attachments from the Microsoft TNEF (Transport Neutral Encoding Format, aka Winmail.dat) used with some Microsoft email clients.</p></div>
+<div class="section">
+<h3><a name="CAD_formats">CAD formats</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/dwg/DWGParser.html">DWGParser</a> can extract simple metadata from the DWG CAD format.</p></div>
+<div class="section">
+<h3><a name="Font_formats">Font formats</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/font/TrueTypeParser.html">TrueTypeParser</a> class can extract simple metadata from the TrueType font format. The <a href="./api/org/apache/tika/parser/font/AdobeFontMetricParser.html">AdobeFontMetricParser</a> class does something similar for Adobe Font Metrics files.</p></div>
+<div class="section">
+<h3><a name="Scientific_formats">Scientific formats</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/dif/DIFParser.html">DIFParser</a> is able to extract attribute metadata from the GCMD Directory Interchange Format (DIF) scientific file format.</p>
+<p>The <a href="./api/org/apache/tika/parser/gdal/GDALParser.html">GDALParser</a> is able to extract attribute metadata from the GDAL scientific file format.</p>
+<p>The <a href="./api/org/apache/tika/parser/geoinfo/GeographicInformationParser.html">GeographicInformationParser</a> is able to extract attribute metadata from the ISO-19139 geographic information file format.</p>
+<p>The <a href="./api/org/apache/tika/parser/grib/GribParser.html">GribParser</a> is able to extract attribute metadata from the Grib scientific file format.</p>
+<p>The <a href="./api/org/apache/tika/parser/hdf/HDFParser.html">HDFParser</a> is able to extract attribute metadata from the HDF scientific file format.</p>
+<p>The <a href="./api/org/apache/tika/parser/isatab/ISArchiveParser.html">ISArchiveParser</a> is able to extract attribute metadata from the ISA-Tab (ISA Tools) family of scientific file formats.</p>
+<p>The <a href="./api/org/apache/tika/parser/netcdf/NetCDFParser.html">NetCDFParser</a> is able to extract attribute metadata from the NetCDF scientific file format.</p>
+<p>The <a href="./api/org/apache/tika/parser/mat/MatParser.html">MatParser</a> is able to extract attribute metadata from the Matlab scientific file format.</p></div>
+<div class="section">
+<h3><a name="Executable_programs_and_libraries">Executable programs and libraries</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/executable/ExecutableParser.html">ExecutableParser</a> can extract metadata information on platforms, architectures and types from a range of executable formats and libraries, such as Windows Executables and Linux / BSD programs and libraries.</p></div>
+<div class="section">
+<h3><a name="Crypto_formats">Crypto formats</a></h3>
+<p>The <a href="./api/org/apache/tika/parser/crypto/Pkcs7Parser.html">Pkcs7Parser</a> is able to parse the contents of PKCS7 signed messages, but doesn't include any information from the outer PKCS7 wrapper.</p></div></div>
+<div class="section">
+<h2>Full list of supported formats:<a name="Full_list_of_supported_formats:"></a></h2>
+<p>TODO Populate this at release time</p></div>
+      </div>
+      <div id="sidebar">
+        <div id="navigation">
+                    <h5>Apache Tika</h5>
+            <ul>
+              
+    <li class="none">
+                    <a href="../index.html">Introduction</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../download.html">Download</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../contribute.html">Contribute</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../mail-lists.html">Mailing Lists</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://wiki.apache.org/tika/" class="externalLink">Tika Wiki</a>
+          </li>
+              
+    <li class="none">
+                    <a href="https://issues.apache.org/jira/browse/TIKA" class="externalLink">Issue Tracker</a>
+          </li>
+          </ul>
+              <h5>Documentation</h5>
+            <ul>
+              
+          
+                    
+                  
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="expanded">
+                    <a href="../1.8/index.html">Apache Tika 1.8</a>
+                  <ul>
+                  
+    <li class="none">
+                    <a href="../1.8/gettingstarted.html">Getting Started</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.8/formats.html">Supported Formats</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.8/parser.html">Parser API</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.8/parser_guide.html">Parser 5min Quick Start Guide</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.8/detection.html">Content and Language Detection</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.8/examples.html">Usage Examples</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../1.8/api/">API Documentation</a>
+          </li>
+              </ul>
+        </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.7/index.html">Apache Tika 1.7</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.6/index.html">Apache Tika 1.6</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.5/index.html">Apache Tika 1.5</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.4/index.html">Apache Tika 1.4</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.3/index.html">Apache Tika 1.3</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.2/index.html">Apache Tika 1.2</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../1.1/index.html">Apache Tika 1.1</a>
+                </li>
+          </ul>
+              <h5>The Apache Software Foundation</h5>
+            <ul>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/" class="externalLink">About</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/licenses/" class="externalLink">License</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/security/" class="externalLink">Security</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/sponsorship.html" class="externalLink">Sponsorship</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/thanks.html" class="externalLink">Thanks</a>
+          </li>
+          </ul>
+      
+          <div id="search">
+            <h5>Search with Apache Solr</h5>
+            <form action="http://search.lucidimagination.com/p:tika"
+                  method="get" id="searchform">
+              <input type="text" id="query" name="q"/>
+              <select name="searchProvider" id="searchProvider">
+                <option value="any">provider</option>
+                <option value="lucid">Lucid Find</option>
+                <option value="sl">Search-Lucene</option>
+              </select>
+              <input type="submit" id="submit" value="Search" name="Search"
+                     onclick="selectProvider(this.form)"/>
+            </form>
+          </div>
+
+          <div id="bookpromo">
+            <h5>Books about Tika</h5>
+            <p>
+              <a href="http://manning.com/mattmann/" title="Tika in Action"
+                ><img src="../mattmann_cover150.jpg" alt="Tika in Action"
+                      width="150" height="186"/></a>
+            </p>
+          </div>
+        </div>
+      </div>
+      <div id="footer">
+        <p>
+          Copyright &#169; 2015
+          <a href="http://www.apache.org/">The Apache Software Foundation</a>.
+          Site powered by <a href="http://maven.apache.org/">Apache Maven</a>. 
+          Search powered by
+          <a href="http://www.lucidimagination.com">Lucid Imagination</a>
+          and <a href="http://sematext.com">Sematext</a>.
+          <br/>
+          Apache Tika, Tika, Apache, the Apache feather logo, and the Apache
+          Tika project logo are trademarks of The Apache Software Foundation.
+        </p>
+      </div>
+    </div>
+  </body>
+</html>

Modified: tika/site/publish/plugin-management.html
URL: http://svn.apache.org/viewvc/tika/site/publish/plugin-management.html?rev=1677746&r1=1677745&r2=1677746&view=diff
==============================================================================
--- tika/site/publish/plugin-management.html (original)
+++ tika/site/publish/plugin-management.html Tue May  5 06:22:29 2015
@@ -110,7 +110,7 @@
 <tr class="b">
 <td>org.apache.maven.plugins</td>
 <td><a class="externalLink" href="http://maven.apache.org/plugins/maven-dependency-plugin/">maven-dependency-plugin</a></td>
-<td>2.1</td></tr>
+<td>2.8</td></tr>
 <tr class="a">
 <td>org.apache.maven.plugins</td>
 <td><a class="externalLink" href="http://maven.apache.org/plugins/maven-deploy-plugin/">maven-deploy-plugin</a></td>

Modified: tika/site/src/site/apt/1.9/formats.apt
URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.9/formats.apt?rev=1677746&r1=1677745&r2=1677746&view=diff
==============================================================================
--- tika/site/src/site/apt/1.9/formats.apt (original)
+++ tika/site/src/site/apt/1.9/formats.apt Tue May  5 06:22:29 2015
@@ -214,7 +214,7 @@ Supported Document Formats
    The {{{./api/org/apache/tika/parser/mbox/OutlookPSTParser.html}OutlookPSTParser}} can
    extract email messages from the Microsoft Outlook PST email format.
 
-   The {{{./api/org/apache/tika/parser/microsoft/TNEFParser.html}TNEFParser} can
+   The {{{./api/org/apache/tika/parser/microsoft/TNEFParser.html}TNEFParser}} can
    extract email attachments from the Microsoft TNEF (Transport Neutral Encoding
    Format, aka Winmail.dat) used with some Microsoft email clients.
 
@@ -249,7 +249,7 @@ Supported Document Formats
    The {{{./api/org/apache/tika/parser/hdf/HDFParser.html}HDFParser}}
    is able to extract attribute metadata from the HDF scientific file format.
 
-   The {{{./api/org/apache/tika/parser/isatab/ISArchiveParser.html}ISArchiveParser}
+   The {{{./api/org/apache/tika/parser/isatab/ISArchiveParser.html}ISArchiveParser}}
    is able to extract attribute metadata from the ISA-Tab (ISA Tools) family of
    scientific file formats.