You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2016/02/20 01:42:14 UTC
svn commit: r1731321 [14/14] - in /tika/site: publish/ publish/0.10/
publish/0.5/ publish/0.6/ publish/0.7/ publish/0.8/ publish/0.9/
publish/1.0/ publish/1.1/ publish/1.10/ publish/1.11/ publish/1.12/
publish/1.13/ publish/1.2/ publish/1.3/ publish/1....
Added: tika/site/src/site/apt/1.12/parser_guide.apt
URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.12/parser_guide.apt?rev=1731321&view=auto
==============================================================================
--- tika/site/src/site/apt/1.12/parser_guide.apt (added)
+++ tika/site/src/site/apt/1.12/parser_guide.apt Sat Feb 20 00:42:12 2016
@@ -0,0 +1,141 @@
+ --------------------------------------------
+ Get Tika parsing up and running in 5 minutes
+ --------------------------------------------
+
+~~ Licensed to the Apache Software Foundation (ASF) under one or more
+~~ contributor license agreements. See the NOTICE file distributed with
+~~ this work for additional information regarding copyright ownership.
+~~ The ASF licenses this file to You under the Apache License, Version 2.0
+~~ (the "License"); you may not use this file except in compliance with
+~~ the License. You may obtain a copy of the License at
+~~
+~~ http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License.
+
+Get Tika parsing up and running in 5 minutes
+
+ This page is a quick start guide showing how to add a new parser to Apache Tika.
+ By following the simple steps listed below, you can have your new parser running in only 5 minutes.
+
+%{toc|section=1|fromDepth=1}
+
+* {Getting Started}
+
+ The {{{./gettingstarted.html}Getting Started}} document describes how to
+ build Apache Tika from sources and how to start using Tika in an application. Pay close attention
+ and follow the instructions in the "Getting and building the sources" section.
+
+
+* {Add your MIME-Type}
+
+ Tika loads the core, standard MIME-Types from the file
+ "org/apache/tika/mime/tika-mimetypes.xml", which comes from
+ {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}} .
+ If your new MIME-Type is a standard one which is missing from Tika,
+ submit a patch for this file!
+
+ If your MIME-Type needs adding, create a new file
+ "org/apache/tika/mime/custom-mimetypes.xml" in your codebase.
+ You should add to it something like this:
+
+---
+ <?xml version="1.0" encoding="UTF-8"?>
+ <mime-info>
+ <mime-type type="application/hello">
+ <glob pattern="*.hi"/>
+ </mime-type>
+ </mime-info>
+---
+
+* {Create your Parser class}
+
+ Now, you need to create your new parser. This is a class that must
+ implement the Parser interface offered by Tika. Instead of implementing
+ the Parser interface directly, it is recommended that you extend the
+ abstract class AbstractParser if possible. AbstractParser handles
+ translating between API changes for you.
+
+ A very simple Tika Parser looks like this:
+
+---
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * @Author: Arturo Beltran
+ */
+package org.apache.tika.parser.hello;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class HelloParser extends AbstractParser {
+
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("hello"));
+ public static final String HELLO_MIME_TYPE = "application/hello";
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ metadata.set(Metadata.CONTENT_TYPE, HELLO_MIME_TYPE);
+ metadata.set("Hello", "World");
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+}
+---
+
+ Pay special attention to the definition of the SUPPORTED_TYPES static class
+ field in the parser class that defines what MIME-Types it supports. If
+ your MIME-Types aren't standard ones, ensure you have listed them in a
+ "custom-mimetypes.xml" file so that Tika knows about them (see above).
+
+ It is in the "parse" method that you will do all your work: that is, extracting
+ the information from the resource and then setting the metadata.
+
+* {List the new parser}
+
+ Finally, you should explicitly tell the AutoDetectParser to include your new
+ parser. This step is only needed if you want to use the AutoDetectParser functionality.
+ If you figure out the correct parser in a different way, it isn't needed.
+
+ List your new parser in:
+ {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}
+
+
Modified: tika/site/src/site/apt/download.apt.vm
URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/download.apt.vm?rev=1731321&r1=1731320&r2=1731321&view=diff
==============================================================================
--- tika/site/src/site/apt/download.apt.vm (original)
+++ tika/site/src/site/apt/download.apt.vm Sat Feb 20 00:42:12 2016
@@ -25,18 +25,18 @@ Download Apache Tika
* {{{http://www.apache.org/dyn/closer.cgi/tika/tika-${project.parent.version}-src.zip}Mirrors for apache-tika-${project.parent.version}-src.zip}}
(source archive, {{{http://www.apache.org/dist/tika/tika-${project.parent.version}-src.zip.asc}PGP signature}})\
- SHA1: <<<d0dde7b3a4f1a2fb6ccd741552ea180dddab630a>>>\
- MD5: <<<ccca11a7e5c300e438b2a52012cf4e39>>>
+ SHA1: <<<30e64645af643959841ac3bb3c41f7e64eba7e5f>>>\
+ MD5: <<<ccf8adb2260476244618a488a905490b>>>
* {{{http://www.apache.org/dyn/closer.cgi/tika/tika-app-${project.parent.version}.jar}Mirrors for tika-app-${project.parent.version}.jar}}
(runnable jar, {{{http://www.apache.org/dist/tika/tika-app-${project.parent.version}.jar.asc}PGP signature}})\
- SHA1: <<<59cc7c4c48a6a41899ca282d925b2738d05a45a8>>>\
- MD5: <<<3e133bcb3cd709fddd1bda3eebc1a0e5>>>\
+ SHA1: <<<8d5c5f9e14b53a807a9d3d99ef34e63c38b9b418>>>\
+ MD5: <<<bf0346321c71ff62f514e096086f5346>>>
* {{{http://www.apache.org/dyn/closer.cgi/tika/tika-server-${project.parent.version}.jar}Mirrors for tika-server-${project.parent.version}.jar}}
(runnable jar, {{{http://www.apache.org/dist/tika/tika-server-${project.parent.version}.jar.asc}PGP signature}})\
- SHA1: <<<c1ca6453573fb7fa1f6b3d81dc4c9847a9a86a62>>>\
- MD5: <<<7e28f3288c3bcd0c26ac6f557ddfb977>>>
+ SHA1: <<<e9655cbf4f15e9d2934d697708b66d9eeeca4ee1>>>\
+ MD5: <<<cf34921c57ef5d6002f3088536d2f2ed>>>
[]
Modified: tika/site/src/site/apt/index.apt.vm
URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/index.apt.vm?rev=1731321&r1=1731320&r2=1731321&view=diff
==============================================================================
--- tika/site/src/site/apt/index.apt.vm (original)
+++ tika/site/src/site/apt/index.apt.vm Sat Feb 20 00:42:12 2016
@@ -39,6 +39,15 @@ Apache Tika - a content analysis toolkit
Latest News
+ [19 February 2016: Apache Tika Release]
+ Apache Tika 1.12 has been released! This release includes some improvements
+ to Named Entity Recognition (Stanford NER integration and Apache OpenNLP)
+ as well as efficiency improvements to the GeoTopicParser. There are
+ also bug fixes to the Tika REST server in this release. Please see the
+ {{{https://dist.apache.org/repos/dist/release/tika/CHANGES-1.12.txt}CHANGES.txt}}
+ file for a full list of changes in this release and have a look at the download
+ page for more information on how to obtain Apache Tika 1.12.
+
[25 October 2015: Apache Tika Release]
Apache Tika 1.11 has been released! This release includes several improvements
that better utilize Java7 support, that help extract more content using the
Modified: tika/site/src/site/site.xml
URL: http://svn.apache.org/viewvc/tika/site/src/site/site.xml?rev=1731321&r1=1731320&r2=1731321&view=diff
==============================================================================
--- tika/site/src/site/site.xml (original)
+++ tika/site/src/site/site.xml Sat Feb 20 00:42:12 2016
@@ -40,7 +40,17 @@
<item name="Issue Tracker" href="https://issues.apache.org/jira/browse/TIKA"/>
</menu>
<menu name="Documentation">
- <item name="Apache Tika 1.11" href="1.11/index.html">
+ <item name="Apache Tika 1.12" href="1.12/index.html">
+ <item name="Getting Started" href="1.12/gettingstarted.html"/>
+ <item name="Supported Formats" href="1.12/formats.html"/>
+ <item name="Parser API" href="1.12/parser.html"/>
+ <item name="Parser 5min Quick Start Guide" href="1.12/parser_guide.html"/>
+ <item name="Content and Language Detection" href="1.12/detection.html"/>
+ <item name="Configuring Tika" href="1.12/configuring.html"/>
+ <item name="Usage Examples" href="1.12/examples.html"/>
+ <item name="API Documentation" href="1.12/api/"/>
+ </item>
+ <item name="Apache Tika 1.11" href="1.11/index.html" collapse="true">
<item name="Getting Started" href="1.11/gettingstarted.html"/>
<item name="Supported Formats" href="1.11/formats.html"/>
<item name="Parser API" href="1.11/parser.html"/>
@@ -70,15 +80,6 @@
<item name="Usage Examples" href="1.9/examples.html"/>
<item name="API Documentation" href="1.9/api/"/>
</item>
- <item name="Apache Tika 1.8" href="1.8/index.html" collapse="true">
- <item name="Getting Started" href="1.8/gettingstarted.html"/>
- <item name="Supported Formats" href="1.8/formats.html"/>
- <item name="Parser API" href="1.8/parser.html"/>
- <item name="Parser 5min Quick Start Guide" href="1.8/parser_guide.html"/>
- <item name="Content and Language Detection" href="1.8/detection.html"/>
- <item name="Usage Examples" href="1.8/examples.html"/>
- <item name="API Documentation" href="1.8/api/"/>
- </item>
</menu>
<menu name="The Apache Software Foundation">
<item name="About" href="http://www.apache.org/foundation/"/>