You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/10/06 22:43:19 UTC

svn commit: r1179842 - in /tika/site: publish/1.0/ publish/1.0/parser_guide.html publish/team-list.html src/site/apt/1.0/ src/site/apt/1.0/parser_guide.apt

Author: nick
Date: Thu Oct  6 20:43:18 2011
New Revision: 1179842

URL: http://svn.apache.org/viewvc?rev=1179842&view=rev
Log:
Create the initial 1.0 parser guide document, based on the simplifications to custom mimetypes offered by TIKA-746, along with suggesting people extend AbstractParser. (Document not linked)

Added:
    tika/site/publish/1.0/
    tika/site/publish/1.0/parser_guide.html
    tika/site/src/site/apt/1.0/
    tika/site/src/site/apt/1.0/parser_guide.apt   (with props)
Modified:
    tika/site/publish/team-list.html

Added: tika/site/publish/1.0/parser_guide.html
URL: http://svn.apache.org/viewvc/tika/site/publish/1.0/parser_guide.html?rev=1179842&view=auto
==============================================================================
--- tika/site/publish/1.0/parser_guide.html (added)
+++ tika/site/publish/1.0/parser_guide.html Thu Oct  6 20:43:18 2011
@@ -0,0 +1,335 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+ 
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+
+
+
+
+
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <title>Apache Tika - Get Tika parsing up and running in 5 minutes</title>
+    <style type="text/css" media="all">
+      @import url("../css/site.css");
+    </style>
+    <link rel="icon" type="image/png" href="../tikaNoText16.png" />
+    <script type="text/javascript">
+      function selectProvider(form) {
+        provider = form.elements['searchProvider'].value;
+        if (provider == "any") {
+          if (Math.random() > 0.5) {
+            provider = "lucid";
+          } else {
+            provider = "sl";
+          }
+        }
+        if (provider == "lucid") {
+          form.action = "http://search.lucidimagination.com/p:tika";
+        } else if (provider == "sl") {
+          form.action = "http://search-lucene.com/tika";
+        }
+        days = 90;
+        date = new Date();
+        date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
+        expires = "; expires=" + date.toGMTString();
+        document.cookie = "searchProvider=" + provider + expires + "; path=/";
+      }
+      function initProvider() {
+        if (document.cookie.length>0) {
+          cStart=document.cookie.indexOf("searchProvider=");
+          if (cStart!=-1) {
+            cStart=cStart + "searchProvider=".length;
+            cEnd=document.cookie.indexOf(";", cStart);
+            if (cEnd==-1) {
+              cEnd=document.cookie.length;
+            }
+            provider = unescape(document.cookie.substring(cStart,cEnd));
+            document.forms['searchform'].elements['searchProvider'].value = provider;
+          }
+        }
+        document.forms['searchform'].elements['q'].focus();
+      }
+    </script>
+  </head>
+  <body onLoad="initProvider();">
+    <div id="body">
+      <div id="banner">
+        <a href="http://tika.apache.org" id="bannerLeft" title="Apache Tika"
+          ><img src="http://tika.apache.org/tika.png" alt="Apache Tika"
+                width="292" height="100"/></a>
+        <a href="http://www.apache.org/" id="bannerRight"
+           title="The Apache Software Foundation"
+          ><img src="http://tika.apache.org/asf-logo.gif" alt="The Apache Software Foundation"
+                width="387" height="100"/></a>
+      </div>
+      <div id="content">
+        <!-- Licensed to the Apache Software Foundation (ASF) under one or more --><!-- contributor license agreements.  See the NOTICE file distributed with --><!-- this work for additional information regarding copyright ownership. --><!-- The ASF licenses this file to You under the Apache License, Version 2.0 --><!-- (the "License"); you may not use this file except in compliance with --><!-- the License.  You may obtain a copy of the License at --><!--  --><!-- http://www.apache.org/licenses/LICENSE-2.0 --><!--  --><!-- Unless required by applicable law or agreed to in writing, software --><!-- distributed under the License is distributed on an "AS IS" BASIS, --><!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --><!-- See the License for the specific language governing permissions and --><!-- limitations under the License. --><div class="section"><h2>Get Tika parsing up and running in 5 minutes<a name="Get_Tika_parsing_up_and_running_in_5_min
 utes"></a></h2><p>This page is a quick start guide showing how to add a new parser to Apache Tika. Following the simple steps listed below your new parser can be running in only 5 minutes.</p><ul><li><a href="#Get_Tika_parsing_up_and_running_in_5_minutes">Get Tika parsing up and running in 5 minutes</a><ul><li><a href="#Getting_Started">Getting Started</a></li><li><a href="#Add_your_MIME-Type">Add your MIME-Type</a></li><li><a href="#Create_your_Parser_class">Create your Parser class</a></li><li><a href="#List_the_new_parser">List the new parser</a></li></ul></li></ul><div class="section"><h3><a name="Getting_Started">Getting Started</a><a name="Getting_Started"></a></h3><p>The <a href="./gettingstarted.html">Getting Started</a> document describes how to build Apache Tika from sources and how to start using Tika in an application. Pay close attention and follow the instructions in the &quot;Getting and building the sources&quot; section.</p></div><div class="section"><h3><a n
 ame="Add_your_MIME-Type">Add your MIME-Type</a><a name="Add_your_MIME-Type"></a></h3><p>Tika loads the core, standard MIME-Types from the file &quot;org/apache/tika/mime/tika-mimetypes.xml&quot;, which comes from <a class="externalLink" href="http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml">tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml</a> . If your new MIME-Type is a standard one which is missing from Tika, submit a patch for this file!</p><p>If your MIME-Type needs adding, create a new file &quot;org/apache/tika/mime/custom-mimetypes.xml&quot; in your codebase. You should add to it something like this:</p><div><pre> &lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt;
+ &lt;mime-info&gt;
+   &lt;mime-type type=&quot;application/hello&quot;&gt;
+          &lt;glob pattern=&quot;*.hi&quot;/&gt;
+   &lt;/mime-type&gt;
+ &lt;/mime-info&gt;</pre></div></div><div class="section"><h3><a name="Create_your_Parser_class">Create your Parser class</a><a name="Create_your_Parser_class"></a></h3><p>Now, you need to create your new parser. This is a class that must implement the Parser interface offered by Tika. Instead of implementing the Parser interface directly, it is recommended that you extend the abstract class AbstractParser if possible. AbstractParser handles translating between API changes for you.</p><p>A very simple Tika Parser looks like this:</p><div><pre>/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the &quot;License&quot;); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an &quot;AS IS&quot; BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * @Author: Arturo Beltran
+ */
+package org.apache.tika.parser.hello;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class HelloParser extends AbstractParser {
+
+        private static final Set&lt;MediaType&gt; SUPPORTED_TYPES = Collections.singleton(MediaType.application(&quot;hello&quot;));
+        public static final String HELLO_MIME_TYPE = &quot;application/hello&quot;;
+        
+        public Set&lt;MediaType&gt; getSupportedTypes(ParseContext context) {
+                return SUPPORTED_TYPES;
+        }
+
+        public void parse(
+                        InputStream stream, ContentHandler handler,
+                        Metadata metadata, ParseContext context)
+                        throws IOException, SAXException, TikaException {
+
+                metadata.set(Metadata.CONTENT_TYPE, HELLO_MIME_TYPE);
+                metadata.set(&quot;Hello&quot;, &quot;World&quot;);
+
+                XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+                xhtml.startDocument();
+                xhtml.endDocument();
+        }
+}</pre></div><p>Pay special attention to the definition of the SUPPORTED_TYPES static class field in the parser class that defines what MIME-Types it supports. If your MIME-Types aren't standard ones, ensure you listed them in a &quot;custom-mimetypes.xml&quot; file so that Tika knows about them (see above).</p><p>It is in the &quot;parse&quot; method that you will do all your work: that is, extract the information from the resource and then set the metadata.</p></div><div class="section"><h3><a name="List_the_new_parser">List the new parser</a><a name="List_the_new_parser"></a></h3><p>Finally, you should explicitly tell the AutoDetectParser to include your new parser. This step is only needed if you want to use the AutoDetectParser functionality. If you figure out the correct parser in a different way, it isn't needed. </p><p>List your new parser in: <a class="externalLink" href="http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.ap
 ache.tika.parser.Parser">tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser</a></p></div></div>
+      </div>
+      <div id="sidebar">
+        <div id="navigation">
+                    <h5>Apache Tika</h5>
+            <ul>
+              
+    <li class="none">
+                    <a href="../index.html">Introduction</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../download.html">Download</a>
+          </li>
+              
+    <li class="none">
+                    <a href="../mail-lists.html">Mailing Lists</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://wiki.apache.org/tika/" class="externalLink">Tika Wiki</a>
+          </li>
+              
+    <li class="none">
+                    <a href="https://issues.apache.org/jira/browse/TIKA" class="externalLink">Issue Tracker</a>
+          </li>
+          </ul>
+              <h5>Documentation</h5>
+            <ul>
+              
+          
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="expanded">
+                    <a href="../0.10/index.html">Apache Tika 0.10</a>
+                  <ul>
+                  
+    <li class="none">
+                    <a href="../0.10/gettingstarted.html">Getting Started</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../0.10/formats.html">Supported Formats</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../0.10/parser.html">Parser API</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../0.10/parser_guide.html">Parser 5min Quick Start Guide</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../0.10/detection.html">Content and Language Detection</a>
+          </li>
+                  
+    <li class="none">
+                    <a href="../0.10/api/">API Documentation</a>
+          </li>
+              </ul>
+        </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../0.9/index.html">Apache Tika 0.9</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../0.8/index.html">Apache Tika 0.8</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../0.7/index.html">Apache Tika 0.7</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../0.6/index.html">Apache Tika 0.6</a>
+                </li>
+              
+                
+                    
+                  
+                  
+                  
+              
+        <li class="collapsed">
+                    <a href="../0.5/index.html">Apache Tika 0.5</a>
+                </li>
+          </ul>
+              <h5>The Apache Software Foundation</h5>
+            <ul>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/" class="externalLink">About</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/licenses/" class="externalLink">License</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/security/" class="externalLink">Security</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/sponsorship.html" class="externalLink">Sponsorship</a>
+          </li>
+              
+    <li class="none">
+                    <a href="http://www.apache.org/foundation/thanks.html" class="externalLink">Thanks</a>
+          </li>
+          </ul>
+      
+          <div id="search">
+            <h5>Search with Apache Solr</h5>
+            <form action="http://search.lucidimagination.com/p:tika"
+                  method="get" id="searchform">
+              <input type="text" id="query" name="q"/>
+              <select name="searchProvider" id="searchProvider">
+                <option value="any">provider</option>
+                <option value="lucid">Lucid Find</option>
+                <option value="sl">Search-Lucene</option>
+              </select>
+              <input type="submit" id="submit" value="Search" name="Search"
+                     onclick="selectProvider(this.form)"/>
+            </form>
+          </div>
+
+          <div id="bookpromo">
+            <h5>Books about Tika</h5>
+            <p>
+              <a href="http://manning.com/mattmann/" title="Tika in Action"
+                ><img src="../mattmann_cover150.jpg"
+                      width="150" height="186"/></a>
+            </p>
+          </div>
+        </div>
+      </div>
+      <div id="footer">
+        <p>
+          Copyright &#169; 2011
+          <a href="http://www.apache.org/">The Apache Software Foundation</a>.
+          Site powered by <a href="http://maven.apache.org/">Apache Maven</a>. 
+          Search powered by
+          <a href="http://www.lucidimagination.com">Lucid Imagination</a>
+          and <a href="http://sematext.com">Sematext</a>.
+          <br/>
+          Apache Tika, Tika, Apache, the Apache feather logo, and the Apache
+          Tika project logo are trademarks of The Apache Software Foundation.
+        </p>
+      </div>
+    </div>
+  </body>
+</html>

Modified: tika/site/publish/team-list.html
URL: http://svn.apache.org/viewvc/tika/site/publish/team-list.html?rev=1179842&r1=1179841&r2=1179842&view=diff
==============================================================================
--- tika/site/publish/team-list.html (original)
+++ tika/site/publish/team-list.html Thu Oct  6 20:43:18 2011
@@ -84,7 +84,7 @@
                 width="387" height="100"/></a>
       </div>
       <div id="content">
-        <a name="The_Team"></a><div class="section"><h2>The Team<a name="The_Team"></a></h2><p>A successful project requires many people to play many roles. Some members write code or documentation, while others are valuable as testers, submitting patches and suggestions.</p><p>The team is comprised of Members and Contributors. Members have direct access to the source of a project and actively evolve the code-base. Contributors improve the project through submission of patches and suggestions to the Members. The number of Contributors to the project is unbounded. Get involved today. All contributions to the project are greatly appreciated.</p><a name="Members"></a><div class="section"><h3>Members<a name="Members"></a></h3><p>The following is a list of developers with commit privileges that have directly contributed to the project in one way or another.</p><table align="center" border="0" class="bodyTable"><tr class="a"><th>Id</th><th>Name</th><th>Email</th><th>URL</th><th>Or
 ganization</th><th>Organization URL</th><th>Roles</th><th>Time Zone</th><th>Actual Time (GMT)</th></tr><tr class="b"><td><a name="ridabenjelloun"></a>ridabenjelloun</td><td>Rida Benjelloun</td><td><a class="externalLink" href="mailto:ridabenjelloun@apache.org">ridabenjelloun@apache.org</a></td><td>-</td><td>-</td><td>-</td><td>committer</td><td>-</td><td><span id="developer-0">-</span></td></tr><tr class="a"><td><a name="kbennett"></a>kbennett</td><td>Keith Bennett</td><td>-</td><td>-</td><td>-</td><td>-</td><td>committer</td><td>-</td><td><span id="developer-1">-</span></td></tr><tr class="b"><td><a name="mharwood"></a>mharwood</td><td>Mark Harwood</td><td>-</td><td>-</td><td>-</td><td>-</td><td>committer</td><td>-</td><td><span id="developer-2">-</span></td></tr><tr class="a"><td><a name="kkrugler"></a>kkrugler</td><td>Ken Krugler</td><td><a class="externalLink" href="mailto:kkrugler@apache.org">kkrugler@apache.org</a></td><td><a class="externalLink" href="http://ken-blog.
 krugler.org">http://ken-blog.krugler.org</a></td><td>Bixo Labs</td><td><a class="externalLink" href="http://bixolabs.com">http://bixolabs.com</a></td><td>committer</td><td>-</td><td><span id="developer-3">-</span></td></tr><tr class="b"><td><a name="mattmann"></a>mattmann</td><td>Chris A. Mattmann</td><td><a class="externalLink" href="mailto:mattmann@apache.org">mattmann@apache.org</a></td><td><a class="externalLink" href="http://people.apache.org/~mattmann/">http://people.apache.org/~mattmann/</a></td><td>NASA Jet Propulsion Laboratory</td><td><a class="externalLink" href="http://www.jpl.nasa.gov">http://www.jpl.nasa.gov</a></td><td>committer</td><td>-8</td><td><span id="developer-4">-8</span></td></tr><tr class="a"><td><a name="dmeikle"></a>dmeikle</td><td>Dave Meikle</td><td>-</td><td>-</td><td>-</td><td>-</td><td>committer</td><td>-</td><td><span id="developer-5">-</span></td></tr><tr class="b"><td><a name="siren"></a>siren</td><td>Sami Siren</td><td>-</td><td>-</td><td>
 -</td><td>-</td><td>committer</td><td>-</td><td><span id="developer-6">-</span></td></tr><tr class="a"><td><a name="jukka"></a>jukka</td><td>Jukka Zitting</td><td>-</td><td>-</td><td>-</td><td>-</td><td>committer</td><td>-</td><td><span id="developer-7">-</span></td></tr><tr class="b"><td><a name="nick"></a>nick</td><td>Nick Burch</td><td>-</td><td>-</td><td>Alfresco</td><td><a class="externalLink" href="http://alfresco.com">http://alfresco.com</a></td><td>committer</td><td>-</td><td><span id="developer-8">-</span></td></tr><tr class="a"><td><a name="maxcom"></a>maxcom</td><td>Maxim Valyanskiy</td><td>-</td><td>-</td><td>Jet Infosystems</td><td>-</td><td>committer</td><td>+3</td><td><span id="developer-9">+3</span></td></tr></table></div><a name="Contributors"></a><div class="section"><h3>Contributors<a name="Contributors"></a></h3><p>The following additional people have contributed to this project through the way of suggestions, patches or documentation.</p><table align="ce
 nter" border="0" class="bodyTable"><tr class="b"><th>Name</th><th>Roles</th></tr><tr class="a"><td>Doug Cutting</td><td>mentor</td></tr><tr class="b"><td>Bertrand Delacretaz</td><td>mentor</td></tr><tr class="a"><td>Niall Pemberton</td><td>emeritus</td></tr></table></div></div><script type="text/javascript">
+        <a name="The_Team"></a><div class="section"><h2>The Team<a name="The_Team"></a></h2><p>A successful project requires many people to play many roles. Some members write code or documentation, while others are valuable as testers, submitting patches and suggestions.</p><p>The team is comprised of Members and Contributors. Members have direct access to the source of a project and actively evolve the code-base. Contributors improve the project through submission of patches and suggestions to the Members. The number of Contributors to the project is unbounded. Get involved today. All contributions to the project are greatly appreciated.</p><a name="Members"></a><div class="section"><h3>Members<a name="Members"></a></h3><p>The following is a list of developers with commit privileges that have directly contributed to the project in one way or another.</p><table align="center" border="0" class="bodyTable"><tr class="a"><th>Id</th><th>Name</th><th>Email</th><th>URL</th><th>Or
 ganization</th><th>Organization URL</th><th>Roles</th><th>Time Zone</th><th>Actual Time (GMT)</th></tr><tr class="b"><td><a name="ridabenjelloun"></a>ridabenjelloun</td><td>Rida Benjelloun</td><td><a class="externalLink" href="mailto:ridabenjelloun@apache.org">ridabenjelloun@apache.org</a></td><td>-</td><td>-</td><td>-</td><td>committer</td><td>-</td><td><span id="developer-0">-</span></td></tr><tr class="a"><td><a name="kbennett"></a>kbennett</td><td>Keith Bennett</td><td>-</td><td>-</td><td>-</td><td>-</td><td>committer</td><td>-</td><td><span id="developer-1">-</span></td></tr><tr class="b"><td><a name="mharwood"></a>mharwood</td><td>Mark Harwood</td><td>-</td><td>-</td><td>-</td><td>-</td><td>committer</td><td>-</td><td><span id="developer-2">-</span></td></tr><tr class="a"><td><a name="kkrugler"></a>kkrugler</td><td>Ken Krugler</td><td><a class="externalLink" href="mailto:kkrugler@apache.org">kkrugler@apache.org</a></td><td><a class="externalLink" href="http://ken-blog.
 krugler.org">http://ken-blog.krugler.org</a></td><td>Bixo Labs</td><td><a class="externalLink" href="http://bixolabs.com">http://bixolabs.com</a></td><td>committer</td><td>-</td><td><span id="developer-3">-</span></td></tr><tr class="b"><td><a name="mattmann"></a>mattmann</td><td>Chris A. Mattmann</td><td><a class="externalLink" href="mailto:mattmann@apache.org">mattmann@apache.org</a></td><td><a class="externalLink" href="http://people.apache.org/~mattmann/">http://people.apache.org/~mattmann/</a></td><td>NASA Jet Propulsion Laboratory</td><td><a class="externalLink" href="http://www.jpl.nasa.gov">http://www.jpl.nasa.gov</a></td><td>committer</td><td>-8</td><td><span id="developer-4">-8</span></td></tr><tr class="a"><td><a name="mikemccand"></a>mikemccand</td><td>Michael McCandless</td><td><a class="externalLink" href="mailto:mikemccand@apache.org">mikemccand@apache.org</a></td><td>-</td><td>IBM</td><td>-</td><td>committer</td><td>-</td><td><span id="developer-5">-</span></
 td></tr><tr class="b"><td><a name="dmeikle"></a>dmeikle</td><td>Dave Meikle</td><td>-</td><td>-</td><td>-</td><td>-</td><td>committer</td><td>-</td><td><span id="developer-6">-</span></td></tr><tr class="a"><td><a name="siren"></a>siren</td><td>Sami Siren</td><td>-</td><td>-</td><td>-</td><td>-</td><td>committer</td><td>-</td><td><span id="developer-7">-</span></td></tr><tr class="b"><td><a name="jukka"></a>jukka</td><td>Jukka Zitting</td><td>-</td><td>-</td><td>-</td><td>-</td><td>committer</td><td>-</td><td><span id="developer-8">-</span></td></tr><tr class="a"><td><a name="nick"></a>nick</td><td>Nick Burch</td><td>-</td><td>-</td><td>Alfresco</td><td><a class="externalLink" href="http://alfresco.com">http://alfresco.com</a></td><td>committer</td><td>-</td><td><span id="developer-9">-</span></td></tr><tr class="b"><td><a name="maxcom"></a>maxcom</td><td>Maxim Valyanskiy</td><td>-</td><td>-</td><td>Jet Infosystems</td><td>-</td><td>committer</td><td>+3</td><td><span id="dev
 eloper-10">+3</span></td></tr><tr class="a"><td><a name="oleg"></a>oleg</td><td>Oleg Tikhonov</td><td>-</td><td>-</td><td>-</td><td>-</td><td>committer</td><td>+2</td><td><span id="developer-11">+2</span></td></tr></table></div><a name="Contributors"></a><div class="section"><h3>Contributors<a name="Contributors"></a></h3><p>The following additional people have contributed to this project through the way of suggestions, patches or documentation.</p><table align="center" border="0" class="bodyTable"><tr class="b"><th>Name</th><th>Roles</th></tr><tr class="a"><td>Doug Cutting</td><td>mentor</td></tr><tr class="b"><td>Bertrand Delacretaz</td><td>mentor</td></tr><tr class="a"><td>Niall Pemberton</td><td>emeritus</td></tr></table></div></div><script type="text/javascript">
 function offsetDate(id, offset) {
     var now = new Date();
     var nowTime = now.getTime();
@@ -97,7 +97,8 @@ function offsetDate(id, offset) {
 
 function init(){
     offsetDate('developer-4', '-8');
-    offsetDate('developer-9', '+3');
+    offsetDate('developer-10', '+3');
+    offsetDate('developer-11', '+2');
 }
 
 window.onLoad = init();

Added: tika/site/src/site/apt/1.0/parser_guide.apt
URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.0/parser_guide.apt?rev=1179842&view=auto
==============================================================================
--- tika/site/src/site/apt/1.0/parser_guide.apt (added)
+++ tika/site/src/site/apt/1.0/parser_guide.apt Thu Oct  6 20:43:18 2011
@@ -0,0 +1,143 @@
+                       --------------------------------------------
+                       Get Tika parsing up and running in 5 minutes
+                       --------------------------------------------
+					   Arturo Beltran
+					   --------------------------------------------
+
+~~ Licensed to the Apache Software Foundation (ASF) under one or more
+~~ contributor license agreements.  See the NOTICE file distributed with
+~~ this work for additional information regarding copyright ownership.
+~~ The ASF licenses this file to You under the Apache License, Version 2.0
+~~ (the "License"); you may not use this file except in compliance with
+~~ the License.  You may obtain a copy of the License at
+~~
+~~     http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License.
+
+Get Tika parsing up and running in 5 minutes
+
+   This page is a quick start guide showing how to add a new parser to Apache Tika.
+   Following the simple steps listed below your new parser can be running in only 5 minutes.
+
+%{toc|section=1|fromDepth=1}
+
+* {Getting Started}
+
+   The {{{./gettingstarted.html}Getting Started}} document describes how to 
+   build Apache Tika from sources and how to start using Tika in an application. Pay close attention 
+   and follow the instructions in the "Getting and building the sources" section.
+   
+
+* {Add your MIME-Type}
+
+   Tika loads the core, standard MIME-Types from the file 
+   "org/apache/tika/mime/tika-mimetypes.xml", which comes from
+   {{{http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}} . 
+   If your new MIME-Type is a standard one which is missing from Tika, 
+   submit a patch for this file!
+
+   If your MIME-Type needs adding, create a new file 
+   "org/apache/tika/mime/custom-mimetypes.xml" in your codebase. 
+   You should add to it something like this:
+   
+---
+ <?xml version="1.0" encoding="UTF-8"?>
+ <mime-info>
+   <mime-type type="application/hello">
+	  <glob pattern="*.hi"/>
+   </mime-type>
+ </mime-info>
+---
+
+* {Create your Parser class}
+
+   Now, you need to create your new parser. This is a class that must 
+   implement the Parser interface offered by Tika. Instead of implementing 
+   the Parser interface directly, it is recommended that you extend the
+   abstract class AbstractParser if possible. AbstractParser handles
+   translating between API changes for you.
+
+   A very simple Tika Parser looks like this:
+   
+---
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * @Author: Arturo Beltran
+ */
+package org.apache.tika.parser.hello;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class HelloParser extends AbstractParser {
+
+	private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("hello"));
+	public static final String HELLO_MIME_TYPE = "application/hello";
+	
+	public Set<MediaType> getSupportedTypes(ParseContext context) {
+		return SUPPORTED_TYPES;
+	}
+
+	public void parse(
+			InputStream stream, ContentHandler handler,
+			Metadata metadata, ParseContext context)
+			throws IOException, SAXException, TikaException {
+
+		metadata.set(Metadata.CONTENT_TYPE, HELLO_MIME_TYPE);
+		metadata.set("Hello", "World");
+
+		XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+		xhtml.startDocument();
+		xhtml.endDocument();
+	}
+}
+---
+   
+   Pay special attention to the definition of the SUPPORTED_TYPES static class 
+   field in the parser class that defines what MIME-Types it supports. If
+   your MIME-Types aren't standard ones, ensure you listed them in a 
+   "custom-mimetypes.xml" file so that Tika knows about them (see above).
+   
+   It is in the "parse" method that you will do all your work: that is, extract 
+   the information from the resource and then set the metadata.
+
+* {List the new parser}
+
+   Finally, you should explicitly tell the AutoDetectParser to include your new 
+   parser. This step is only needed if you want to use the AutoDetectParser functionality. 
+   If you figure out the correct parser in a different way, it isn't needed. 
+   
+   List your new parser in:
+    {{{http://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}
+   
+

Propchange: tika/site/src/site/apt/1.0/parser_guide.apt
------------------------------------------------------------------------------
    svn:executable = *