You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2016/02/20 01:42:14 UTC

svn commit: r1731321 [14/14] - in /tika/site: publish/ publish/0.10/ publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/ publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12/ publish/1.13/ publish/1.2/ publish/1.3/ publish/1....

Added: tika/site/src/site/apt/1.12/parser_guide.apt
URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.12/parser_guide.apt?rev=1731321&view=auto
==============================================================================
--- tika/site/src/site/apt/1.12/parser_guide.apt (added)
+++ tika/site/src/site/apt/1.12/parser_guide.apt Sat Feb 20 00:42:12 2016
@@ -0,0 +1,141 @@
+                       --------------------------------------------
+                       Get Tika parsing up and running in 5 minutes
+                       --------------------------------------------
+
+~~ Licensed to the Apache Software Foundation (ASF) under one or more
+~~ contributor license agreements.  See the NOTICE file distributed with
+~~ this work for additional information regarding copyright ownership.
+~~ The ASF licenses this file to You under the Apache License, Version 2.0
+~~ (the "License"); you may not use this file except in compliance with
+~~ the License.  You may obtain a copy of the License at
+~~
+~~     http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License.
+
+Get Tika parsing up and running in 5 minutes
+
+   This page is a quick start guide showing how to add a new parser to Apache Tika.
+   If you follow the simple steps listed below, your new parser can be running in only 5 minutes.
+
+%{toc|section=1|fromDepth=1}
+
+* {Getting Started}
+
+   The {{{./gettingstarted.html}Getting Started}} document describes how to 
+   build Apache Tika from sources and how to start using Tika in an application. Pay close attention 
+   and follow the instructions in the "Getting and building the sources" section.
+   
+
+* {Add your MIME-Type}
+
+   Tika loads the core, standard MIME-Types from the file 
+   "org/apache/tika/mime/tika-mimetypes.xml", which comes from
+   {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}} . 
+   If your new MIME-Type is a standard one which is missing from Tika, 
+   submit a patch for this file!
+
+   If your MIME-Type is not a standard one, create a new file 
+   "org/apache/tika/mime/custom-mimetypes.xml" in your codebase. 
+   Add something like this to it:
+   
+---
+ <?xml version="1.0" encoding="UTF-8"?>
+ <mime-info>
+   <mime-type type="application/hello">
+	  <glob pattern="*.hi"/>
+   </mime-type>
+ </mime-info>
+---
+
+* {Create your Parser class}
+
+   Now, you need to create your new parser. This is a class that must 
+   implement the Parser interface offered by Tika. Instead of implementing 
+   the Parser interface directly, it is recommended that you extend the
+   abstract class AbstractParser if possible. AbstractParser handles
+   translating between API changes for you.
+
+   A very simple Tika Parser looks like this:
+   
+---
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * @Author: Arturo Beltran
+ */
+package org.apache.tika.parser.hello;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class HelloParser extends AbstractParser {
+
+	private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("hello"));
+	public static final String HELLO_MIME_TYPE = "application/hello";
+	
+	public Set<MediaType> getSupportedTypes(ParseContext context) {
+		return SUPPORTED_TYPES;
+	}
+
+	public void parse(
+			InputStream stream, ContentHandler handler,
+			Metadata metadata, ParseContext context)
+			throws IOException, SAXException, TikaException {
+
+		metadata.set(Metadata.CONTENT_TYPE, HELLO_MIME_TYPE);
+		metadata.set("Hello", "World");
+
+		XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+		xhtml.startDocument();
+		xhtml.endDocument();
+	}
+}
+---
+   
+   Pay special attention to the definition of the SUPPORTED_TYPES static class 
+   field in the parser class that defines what MIME-Types it supports. If
+   your MIME-Types aren't standard ones, ensure you have listed them in a 
+   "custom-mimetypes.xml" file so that Tika knows about them (see above).
+   
+   The "parse" method is where you will do all your work: that is, extract 
+   the information from the resource and then set the metadata.
+
+* {List the new parser}
+
+   Finally, you should explicitly tell the AutoDetectParser to include your new 
+   parser. This step is only needed if you want to use the AutoDetectParser functionality. 
+   If you figure out the correct parser in a different way, it isn't needed. 
+   
+   List your new parser in:
+    {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}
+   
+

Modified: tika/site/src/site/apt/download.apt.vm
URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/download.apt.vm?rev=1731321&r1=1731320&r2=1731321&view=diff
==============================================================================
--- tika/site/src/site/apt/download.apt.vm (original)
+++ tika/site/src/site/apt/download.apt.vm Sat Feb 20 00:42:12 2016
@@ -25,18 +25,18 @@ Download Apache Tika
 
    * {{{http://www.apache.org/dyn/closer.cgi/tika/tika-${project.parent.version}-src.zip}Mirrors for apache-tika-${project.parent.version}-src.zip}}
      (source archive, {{{http://www.apache.org/dist/tika/tika-${project.parent.version}-src.zip.asc}PGP signature}})\
-     SHA1: <<<d0dde7b3a4f1a2fb6ccd741552ea180dddab630a>>>\
-     MD5: <<<ccca11a7e5c300e438b2a52012cf4e39>>>
+     SHA1: <<<30e64645af643959841ac3bb3c41f7e64eba7e5f>>>\
+     MD5: <<<ccf8adb2260476244618a488a905490b>>>
 
    * {{{http://www.apache.org/dyn/closer.cgi/tika/tika-app-${project.parent.version}.jar}Mirrors for tika-app-${project.parent.version}.jar}}
      (runnable jar, {{{http://www.apache.org/dist/tika/tika-app-${project.parent.version}.jar.asc}PGP signature}})\
-     SHA1: <<<59cc7c4c48a6a41899ca282d925b2738d05a45a8>>>\
-     MD5: <<<3e133bcb3cd709fddd1bda3eebc1a0e5>>>\
+     SHA1: <<<8d5c5f9e14b53a807a9d3d99ef34e63c38b9b418>>>\
+     MD5: <<<bf0346321c71ff62f514e096086f5346>>>
 
    * {{{http://www.apache.org/dyn/closer.cgi/tika/tika-server-${project.parent.version}.jar}Mirrors for tika-server-${project.parent.version}.jar}}
      (runnable jar, {{{http://www.apache.org/dist/tika/tika-server-${project.parent.version}.jar.asc}PGP signature}})\
-     SHA1: <<<c1ca6453573fb7fa1f6b3d81dc4c9847a9a86a62>>>\
-     MD5: <<<7e28f3288c3bcd0c26ac6f557ddfb977>>>
+     SHA1: <<<e9655cbf4f15e9d2934d697708b66d9eeeca4ee1>>>\
+     MD5: <<<cf34921c57ef5d6002f3088536d2f2ed>>>
 
    []
 

Modified: tika/site/src/site/apt/index.apt.vm
URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/index.apt.vm?rev=1731321&r1=1731320&r2=1731321&view=diff
==============================================================================
--- tika/site/src/site/apt/index.apt.vm (original)
+++ tika/site/src/site/apt/index.apt.vm Sat Feb 20 00:42:12 2016
@@ -39,6 +39,15 @@ Apache Tika - a content analysis toolkit
 
 Latest News
 
+   [19 February 2016: Apache Tika Release]
+    Apache Tika 1.12 has been released! This release includes some improvements
+    to Named Entity Recognition (Stanford NER integration and Apache OpenNLP) 
+    as well as efficiency improvements to the GeoTopicParser. There are
+    also bugfixes to the Tika REST server in this release. Please see the
+    {{{https://dist.apache.org/repos/dist/release/tika/CHANGES-1.12.txt}CHANGES.txt}}
+    file for a full list of changes in this release and have a look at the download
+    page for more information on how to obtain Apache Tika 1.12.
+
    [25 October 2015: Apache Tika Release]
     Apache Tika 1.11 has been released! This release includes several improvements
     that better utilize Java7 support, that help extract more content using the

Modified: tika/site/src/site/site.xml
URL: http://svn.apache.org/viewvc/tika/site/src/site/site.xml?rev=1731321&r1=1731320&r2=1731321&view=diff
==============================================================================
--- tika/site/src/site/site.xml (original)
+++ tika/site/src/site/site.xml Sat Feb 20 00:42:12 2016
@@ -40,7 +40,17 @@
       <item name="Issue Tracker" href="https://issues.apache.org/jira/browse/TIKA"/>
     </menu>
     <menu name="Documentation">
-      <item name="Apache Tika 1.11" href="1.11/index.html">
+      <item name="Apache Tika 1.12" href="1.12/index.html">
+        <item name="Getting Started" href="1.12/gettingstarted.html"/>
+        <item name="Supported Formats" href="1.12/formats.html"/>
+        <item name="Parser API" href="1.12/parser.html"/>
+        <item name="Parser 5min Quick Start Guide" href="1.12/parser_guide.html"/>
+        <item name="Content and Language Detection" href="1.12/detection.html"/>
+        <item name="Configuring Tika" href="1.12/configuring.html"/>
+        <item name="Usage Examples" href="1.12/examples.html"/>
+        <item name="API Documentation" href="1.12/api/"/>
+      </item>
+      <item name="Apache Tika 1.11" href="1.11/index.html" collapse="true">
         <item name="Getting Started" href="1.11/gettingstarted.html"/>
         <item name="Supported Formats" href="1.11/formats.html"/>
         <item name="Parser API" href="1.11/parser.html"/>
@@ -70,15 +80,6 @@
         <item name="Usage Examples" href="1.9/examples.html"/>
         <item name="API Documentation" href="1.9/api/"/>
       </item>
-      <item name="Apache Tika 1.8" href="1.8/index.html" collapse="true">
-        <item name="Getting Started" href="1.8/gettingstarted.html"/>
-        <item name="Supported Formats" href="1.8/formats.html"/>
-        <item name="Parser API" href="1.8/parser.html"/>
-        <item name="Parser 5min Quick Start Guide" href="1.8/parser_guide.html"/>
-        <item name="Content and Language Detection" href="1.8/detection.html"/>
-        <item name="Usage Examples" href="1.8/examples.html"/>
-        <item name="API Documentation" href="1.8/api/"/>
-      </item>
     </menu>
     <menu name="The Apache Software Foundation">
       <item name="About" href="http://www.apache.org/foundation/"/>