You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2016/11/11 00:09:36 UTC
svn commit: r1769232 - in /tika/site/src/site/apt/1.14: configuring.apt
detection.apt parser.apt parser_guide.apt
Author: mattmann
Date: Fri Nov 11 00:09:36 2016
New Revision: 1769232
URL: http://svn.apache.org/viewvc?rev=1769232&view=rev
Log:
Apache Tika 1.14 docs.
Added:
tika/site/src/site/apt/1.14/configuring.apt
tika/site/src/site/apt/1.14/detection.apt
tika/site/src/site/apt/1.14/parser.apt
tika/site/src/site/apt/1.14/parser_guide.apt
Added: tika/site/src/site/apt/1.14/configuring.apt
URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.14/configuring.apt?rev=1769232&view=auto
==============================================================================
--- tika/site/src/site/apt/1.14/configuring.apt (added)
+++ tika/site/src/site/apt/1.14/configuring.apt Fri Nov 11 00:09:36 2016
@@ -0,0 +1,223 @@
+ ----------------
+ Configuring Tika
+ ----------------
+
+~~ Licensed to the Apache Software Foundation (ASF) under one or more
+~~ contributor license agreements. See the NOTICE file distributed with
+~~ this work for additional information regarding copyright ownership.
+~~ The ASF licenses this file to You under the Apache License, Version 2.0
+~~ (the "License"); you may not use this file except in compliance with
+~~ the License. You may obtain a copy of the License at
+~~
+~~ http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License.
+
+Configuring Tika
+
+ Out of the box, Apache Tika will attempt to start with all available
+ Detectors and Parsers, running with sensible defaults. For most users,
+ this default configuration will work well.
+
+ This page gives you information on how to configure the various
+ components of Apache Tika, such as Parsers and Detectors, if you need
+ fine-grained control over ordering, exclusions and the like.
+
+%{toc|section=1|fromDepth=1}
+
+* {Configuring Parsers}
+
+ Through the Tika Config xml, it is possible to have a high degree of control
+ over which parsers are or aren't used, in what order of preferences etc. It
+ is also possible to override just certain parts, to (for example) have "default
+ except for PDF".
+
+ Currently, it is only possible to have a single parser run against a document.
+ There is on-going discussion around fallback parsers and combining the output
+ of multiple parsers running on a document, but none of these are available yet.
+
+ To override certain default parser behaviours, include the <<< DefaultParser >>>
+ in your configuration, with excludes, then add other parser definitions in.
+ To prevent the <<< DefaultParser >>> (with its auto-discovery) being used,
+ simply omit it from your config, and list all other parsers you want instead.
+
+ To override just some default behaviour, you can use a Tika Config something
+ like this:
+
+---
+<?xml version="1.0" encoding="UTF-8"?>
+<properties>
+ <parsers>
+ <!-- Default Parser for most things, except for 2 mime types, and never
+ use the Executable Parser -->
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <mime-exclude>image/jpeg</mime-exclude>
+ <mime-exclude>application/pdf</mime-exclude>
+ <parser-exclude class="org.apache.tika.parser.executable.ExecutableParser"/>
+ </parser>
+ <!-- Use a different parser for PDF -->
+ <parser class="org.apache.tika.parser.EmptyParser">
+ <mime>application/pdf</mime>
+ </parser>
+ </parsers>
+</properties>
+---
+
+ To configure things in code, the key classes to use to build up your own custom
+ parser hierarchy are
+ {{{./api/org/apache/tika/parser/DefaultParser.html}org.apache.tika.parser.DefaultParser}},
+ {{{./api/org/apache/tika/parser/CompositeParser.html}org.apache.tika.parser.CompositeParser}}
+ and
+ {{{./api/org/apache/tika/parser/ParserDecorator.html}org.apache.tika.parser.ParserDecorator}}.
+
+* {Configuring Detectors}
+
+ Through the Tika Config xml, it is possible to have a high degree of control
+ over which detectors are or aren't used, in what order of preferences etc. It
+ is also possible to override just certain parts, to (for example) have "default
+ except for no POIFS Container Detection".
+
+ To override certain default detector behaviours, include the
+ <<< DefaultDetector >>>, with any <<< detector-exclude >>> entries you need,
+ in your configuration, then add other detector definitions in. To prevent
+ the <<< DefaultDetector >>> (with its auto-discovery) being used, simply omit it
+ from your config, and list all other detectors you want instead.
+
+ To override just some default behaviour, you can use a Tika Config something
+ like this:
+
+---
+<?xml version="1.0" encoding="UTF-8"?>
+<properties>
+ <detectors>
+ <!-- All detectors except built-in container ones -->
+ <detector class="org.apache.tika.detect.DefaultDetector">
+ <detector-exclude class="org.apache.tika.parser.pkg.ZipContainerDetector"/>
+ <detector-exclude class="org.apache.tika.parser.microsoft.POIFSContainerDetector"/>
+ </detector>
+ </detectors>
+</properties>
+---
+
+ Or to just only use certain detectors, you can use a Tika Config something
+ like this:
+
+---
+<?xml version="1.0" encoding="UTF-8"?>
+<properties>
+ <detectors>
+ <!-- Only use these two detectors, and ignore all others -->
+ <detector class="org.apache.tika.parser.pkg.ZipContainerDetector"/>
+ <detector class="org.apache.tika.mime.MimeTypes"/>
+ </detectors>
+</properties>
+---
+
+ In code, the key classes to use to build up your own custom detector
+ hierarchy are
+ {{{./api/org/apache/tika/detect/DefaultDetector.html}org.apache.tika.detect.DefaultDetector}}
+ and
+ {{{./api/org/apache/tika/detect/CompositeDetector.html}org.apache.tika.detect.CompositeDetector}}.
+
+* {Configuring Mime Types}
+
+ TODO Mention non-standard paths, and custom mime type files
+
+* {Configuring Language Identifiers}
+
+ At this time, there is no unified way to configure language identifiers.
+ While the work on that is ongoing, for now you will need to review the
+ {{{./api/}Tika Javadocs}} to see how individual identifiers are configured.
+
+* {Configuring Translators}
+
+ At this time, there is no unified way to configure Translators.
+ While the work on that is ongoing, for now you will need to review the
+ {{{./api/}Tika Javadocs}} to see how individual Translators are configured.
+
+~~ When Translators can have their parameters configured, mention here about
+~~ specifying which single one to use in the Tika Config XML
+
+* {Configuring the Service Loader}
+
+ Tika has a number of service provider types such as parsers, detectors, and translators.
+ The {{{./api/org/apache/tika/config/ServiceLoader.html}org.apache.tika.config.ServiceLoader}} class provides a registry of each type of provider. This allows Tika to create
+ implementations such as {{{./api/org/apache/tika/parser/DefaultParser.html}org.apache.tika.parser.DefaultParser}},
+ {{{./api/org/apache/tika/language/translate/DefaultTranslator.html}org.apache.tika.language.translate.DefaultTranslator}}, and {{{./api/org/apache/tika/detect/DefaultDetector.html}org.apache.tika.detect.DefaultDetector}}
+ that can match the appropriate provider to an incoming piece of content.
+
+ The ServiceLoader's registry can be populated either statically or dynamically.
+
+** Static
+
+ Static loading is the default which requires no configuration. This configuration option is used in
+ Tika deployments where the Tika JAR files reside together in the same classloader hierarchy. The service
+ providers are loaded from provider configuration files located within the tika-parsers JAR file at META-INF/services.
+
+** Dynamic
+
+ Dynamic loading may be required if the tika service providers will reside in different classloaders such as
+ in OSGi. To allow a provider created in tika-config.xml to utilize dynamically loaded services you need to
+ configure the ServiceLoader to be dynamic with the following configuration:
+
+---
+<properties>
+ <service-loader dynamic="true"/>
+ ....
+</properties>
+---
+
+** Load Error Handling
+
+ The ServiceLoader can contain a handler to deal with errors that occur during provider initialization. For example
+ if a class fails to initialize LoadErrorHandler deals with the exception that is thrown.
+ This handler can be configured to:
+
+ * <<< IGNORE >>> - (Default) Do nothing when providers fail to initialize.
+
+ * <<< WARN >>> - Log a warning when providers fail to initialize.
+
+ * <<< THROW >>> - Throw an exception when providers fail to initialize.
+
+ []
+
+ For example to set the LoadErrorHandler to WARN then use the following configuration:
+
+---
+<properties>
+ <service-loader loadErrorHandler="WARN"/>
+ ....
+</properties>
+---
+
+* {Using a Tika Configuration XML file}
+
+ However you call Tika, the System Property of <<< tika.config >>> is
+ checked first, and the Environment Variable of <<< TIKA_CONFIG >>> is
+ tried next. Setting one of those will cause Tika to use your given
+ Tika Config XML file.
+
+ If you are calling Tika from your own code, then you can pass in the
+ location of your Tika Config XML file when you construct your
+ <<<TikaConfig>>> instance. From that, you can fetch your configured
+ parser, detectors etc.
+
+---
+TikaConfig config = new TikaConfig("/path/to/tika-config.xml");
+Detector detector = config.getDetector();
+Parser autoDetectParser = new AutoDetectParser(config);
+---
+
+ For users of the Tika App, in addition to the system property and the
+ environment variable, you can also use the
+ <<< --config=[tika-config.xml] >>> option to select a different
+ Tika Config XML file to use.
+
+ For users of the Tika Server, in addition to the system property and the
+ environment variable, you can also use the <<< -c [tika-config.xml] >>> or
+ <<< --config [tika-config.xml] >>> options to select a different
+ Tika Config XML file to use.
Added: tika/site/src/site/apt/1.14/detection.apt
URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.14/detection.apt?rev=1769232&view=auto
==============================================================================
--- tika/site/src/site/apt/1.14/detection.apt (added)
+++ tika/site/src/site/apt/1.14/detection.apt Fri Nov 11 00:09:36 2016
@@ -0,0 +1,211 @@
+ -----------------
+ Content Detection
+ -----------------
+
+~~ Licensed to the Apache Software Foundation (ASF) under one or more
+~~ contributor license agreements. See the NOTICE file distributed with
+~~ this work for additional information regarding copyright ownership.
+~~ The ASF licenses this file to You under the Apache License, Version 2.0
+~~ (the "License"); you may not use this file except in compliance with
+~~ the License. You may obtain a copy of the License at
+~~
+~~ http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License.
+
+Content Detection
+
+ This page gives you information on how content and language detection
+ works with Apache Tika, and how to tune the behaviour of Tika.
+
+%{toc|section=1|fromDepth=1}
+
+* {The Detector Interface}
+
+ The
+ {{{./api/org/apache/tika/detect/Detector.html}org.apache.tika.detect.Detector}}
+ interface is the basis for most of the content type detection in Apache
+ Tika. All the different ways of detecting content all implement the
+ same common method:
+
+---
+MediaType detect(java.io.InputStream input,
+ Metadata metadata) throws java.io.IOException
+---
+
+ The <<<detect>>> method takes the stream to inspect, and a
+ <<<Metadata>>> object that holds any additional information on
+ the content. The detector will return a
+ {{{./api/org/apache/tika/mime/MediaType.html}MediaType}} object describing
+ its best guess as to the type of the file.
+
+ In general, only two keys on the Metadata object are used by Detectors.
+ These are <<<Metadata.RESOURCE_NAME_KEY>>> which should hold the name
+ of the file (where known), and <<<Metadata.CONTENT_TYPE>>> which should
+ hold the advertised content type of the file (eg from a webserver or
+ a content repository).
+
+
+* {Mime Magic Detection}
+
+ By looking for special ("magic") patterns of bytes near the start of
+ the file, it is often possible to detect the type of the file. For
+ some file types, this is a simple process. For others, typically
+ container based formats, the magic detection may not be enough. (More
+ detail on detecting container formats below)
+
+ Tika is able to make use of a mime magic info file, in the
+ {{{http://www.freedesktop.org/standards/shared-mime-info}Freedesktop MIME-info}}
+ format to perform mime magic detection. (Note that Tika supports a few
+ more match types than Freedesktop does)
+
+ This is provided within Tika by
+ {{{./api/org/apache/tika/detect/MagicDetector.html}org.apache.tika.detect.MagicDetector}}. It is most commonly accessed via
+ {{{./api/org/apache/tika/mime/MimeTypes.html}org.apache.tika.mime.MimeTypes}},
+ normally sourced from the <<<tika-mimetypes.xml>>> and <<<custom-mimetypes.xml>>>
+ files. For more information on defining your own custom mimetypes, see
+ {{{./parser_guide.html#Add_your_MIME-Type}the new parser guide}}.
+
+
+* {Resource Name Based Detection}
+
+ Where the name of the file is known, it is sometimes possible to guess
+ the file type from the name or extension. Within the
+ <<<tika-mimetypes.xml>>> file is a list of patterns which are used to
+ identify the type from the filename.
+
+ However, because files may be renamed, this method of detection is quick
+ but not always as accurate.
+
+ This is provided within Tika by
+ {{{./api/org/apache/tika/detect/NameDetector.html}org.apache.tika.detect.NameDetector}}.
+
+
+* {Known Content Type "Detection"}
+
+ Sometimes, the mime type for a file is already known, such as when
+ downloading from a webserver, or when retrieving from a content store.
+ This information can be used by detectors, such as
+ {{{./api/org/apache/tika/mime/MimeTypes.html}org.apache.tika.mime.MimeTypes}}.
+
+
+* {The default Mime Types Detector}
+
+ By default, the mime type detection in Tika is provided by
+ {{{./api/org/apache/tika/mime/MimeTypes.html}org.apache.tika.mime.MimeTypes}}.
+ This detector makes use of <<<tika-mimetypes.xml>>> to power
+ magic based and filename based detection.
+
+ Firstly, magic based detection is used on the start of the file.
+ If the file is an XML file, then the start of the XML is processed
+ to look for root elements. Next, if available, the filename
+ (from <<<Metadata.RESOURCE_NAME_KEY>>>) is
+ then used to improve the detail of the detection, such as when magic
+ detects a text file, and the filename hints it's really a CSV. Finally,
+ if available, the supplied content type (from <<<Metadata.CONTENT_TYPE>>>)
+ is used to further refine the type.
+
+
+* {Container Aware Detection}
+
+ Several common file formats are actually held within a common container
+ format. One example is the PowerPoint .ppt and Word .doc formats, which
+ are both held within an OLE2 container. Another is Apple iWork formats,
+ which are actually a series of XML files within a Zip file.
+
+ Using magic detection, it is easy to spot that a given file is an OLE2
+ document, or a Zip file. Using magic detection alone, it is very difficult
+ (and often impossible) to tell what kind of file lives inside the container.
+
+ For some use cases, speed is important, so having a quick way to know the
+ container type is sufficient. For other cases however, you don't mind
+ spending a bit of time (and memory!) processing the container to get a
+ more accurate answer on its contents. For these cases, the additional
+ container aware detectors contained in the <<<Tika Parsers>>> jar should
+ be used.
+
+ Tika provides a wrapping detector in the form of
+ {{{./api/org/apache/tika/detect/DefaultDetector.html}org.apache.tika.detect.DefaultDetector}}.
+ This uses the service loader to discover all available detectors, including
+ any available container aware ones, and tries them in turn. For container
+ aware detection, include the <<<Tika Parsers>>> jar and its dependencies
+ in your project, then use DefaultDetector along with a <<<TikaInputStream>>>.
+
+ Because these container detectors need to read the whole file to open and
+ inspect the container, they must be used with a
+ {{{./api/org/apache/tika/io/TikaInputStream.html}org.apache.tika.io.TikaInputStream}}.
+ If called with a regular <<<InputStream>>>, then all work will be done
+ by the default Mime Magic detection only.
+
+ For more information on container formats and Tika, see
+ {{{http://wiki.apache.org/tika/MetadataDiscussion}}}
+
+
+* {The default Tika Detector}
+
+ Just as with Parsers, Tika provides a special detector
+ {{{./api/org/apache/tika/detect/DefaultDetector.html}org.apache.tika.detect.DefaultDetector}}
+ which auto-detects (based on service files) the available detectors at
+ runtime, and tries these in turn to identify the file type.
+
+ If only <<<Tika Core>>> is available, the Default Detector will work only
+ with Mime Magic and Resource Name detection. However, if <<<Tika Parsers>>>
+ (and its dependencies!) are available, additional detectors which know about
+ containers (such as zip and ole2) will be used as appropriate, provided that
+ detection is being performed with a
+ {{{./api/org/apache/tika/io/TikaInputStream.html}org.apache.tika.io.TikaInputStream}}.
+ Custom detectors can also be used as desired, they simply need to be listed
+ in a service file much as is done for
+ {{{./parser_guide.html#List_the_new_parser}custom parsers}}.
+
+
+* {Ways of triggering Detection}
+
+ The simplest way to detect is through the
+ {{{./api/org/apache/tika/Tika.html}Tika Facade class}}, which provides methods to
+ detect based on
+ {{{./api/org/apache/tika/Tika.html##detect(java.io.File)}File}},
+ {{{./api/org/apache/tika/Tika.html##detect(java.io.InputStream)}InputStream}},
+ {{{./api/org/apache/tika/Tika.html##detect(java.io.InputStream, java.lang.String)}InputStream and Filename}},
+ {{{./api/org/apache/tika/Tika.html##detect(java.lang.String)}Filename}} or a few others.
+ It works best with a File or
+ {{{./api/org/apache/tika/io/TikaInputStream.html}TikaInputStream}}.
+
+ Alternately, detection can be performed on a specific Detector, or using
+ <<<DefaultDetector>>> to have all available Detectors used. A typical pattern
+ would be something like:
+
+---
+TikaConfig tika = new TikaConfig();
+
+for (File f : myListOfFiles) {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, f.toString());
+ String mimetype = tika.getDetector().detect(
+ TikaInputStream.get(f), metadata);
+ System.out.println("File " + f + " is " + mimetype);
+}
+for (InputStream is : myListOfStreams) {
+ String mimetype = tika.getDetector().detect(
+ TikaInputStream.get(is), new Metadata());
+ System.out.println("Stream " + is + " is " + mimetype);
+}
+---
+
+* {Language Detection}
+
+ Tika is able to help identify the language of a piece of text, which
+ is useful when extracting text from document formats which do not include
+ language information in their metadata.
+
+ The language detection is provided by
+ {{{./api/org/apache/tika/language/LanguageIdentifier.html}org.apache.tika.language.LanguageIdentifier}}
+
+* {More Examples}
+
+ For more examples of Detection using Apache Tika, please take a look at
+ the {{{./examples.html}Tika Examples page}}.
Added: tika/site/src/site/apt/1.14/parser.apt
URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.14/parser.apt?rev=1769232&view=auto
==============================================================================
--- tika/site/src/site/apt/1.14/parser.apt (added)
+++ tika/site/src/site/apt/1.14/parser.apt Fri Nov 11 00:09:36 2016
@@ -0,0 +1,251 @@
+ --------------------
+ The Parser interface
+ --------------------
+
+~~ Licensed to the Apache Software Foundation (ASF) under one or more
+~~ contributor license agreements. See the NOTICE file distributed with
+~~ this work for additional information regarding copyright ownership.
+~~ The ASF licenses this file to You under the Apache License, Version 2.0
+~~ (the "License"); you may not use this file except in compliance with
+~~ the License. You may obtain a copy of the License at
+~~
+~~ http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License.
+
+The Parser interface
+
+ The
+ {{{./api/org/apache/tika/parser/Parser.html}org.apache.tika.parser.Parser}}
+ interface is the key concept of Apache Tika. It hides the complexity of
+ different file formats and parsing libraries while providing a simple and
+ powerful mechanism for client applications to extract structured text
+ content and metadata from all sorts of documents. All this is achieved
+ with a single method:
+
+---
+void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException;
+---
+
+ The <<<parse>>> method takes the document to be parsed and related metadata
+ as input and outputs the results as XHTML SAX events and extra metadata.
+ The parse context argument is used to specify context information (like
+ the current locale) that is not related to any individual document.
+ The main criteria that lead to this design were:
+
+ [Streamed parsing] The interface should require neither the client
+ application nor the parser implementation to keep the full document
+ content in memory or spooled to disk. This allows even huge documents
+ to be parsed without excessive resource requirements.
+
+ [Structured content] A parser implementation should be able to
+ include structural information (headings, links, etc.) in the extracted
+ content. A client application can use this information for example to
+ better judge the relevance of different parts of the parsed document.
+
+ [Input metadata] A client application should be able to include metadata
+ like the file name or declared content type with the document to be
+ parsed. The parser implementation can use this information to better
+ guide the parsing process.
+
+ [Output metadata] A parser implementation should be able to return
+ document metadata in addition to document content. Many document
+ formats contain metadata like the name of the author that may be useful
+ to client applications.
+
+ [Context sensitivity] While the default settings and behaviour of Tika
+ parsers should work well for most use cases, there are still situations
+ where more fine-grained control over the parsing process is desirable.
+ It should be easy to inject such context-specific information to the
+ parsing process without breaking the layers of abstraction.
+
+ []
+
+ These criteria are reflected in the arguments of the <<<parse>>> method.
+
+* Document input stream
+
+ The first argument is an
+ {{{http://docs.oracle.com/javase/6/docs/api/java/io/InputStream.html}InputStream}}
+ for reading the document to be parsed.
+
+ If this document stream can not be read, then parsing stops and the thrown
+ {{{http://docs.oracle.com/javase/6/docs/api/java/io/IOException.html}IOException}}
+ is passed up to the client application. If the stream can be read but
+ not parsed (for example if the document is corrupted), then the parser
+ throws a {{{./api/org/apache/tika/exception/TikaException.html}TikaException}}.
+
+ The parser implementation will consume this stream but <will not close it>.
+ Closing the stream is the responsibility of the client application that
+ opened it in the first place. The recommended pattern for using streams
+ with the <<<parse>>> method is:
+
+---
+InputStream stream = ...; // open the stream
+try {
+ parser.parse(stream, ...); // parse the stream
+} finally {
+ stream.close(); // close the stream
+}
+---
+
+ Some document formats like the OLE2 Compound Document Format used by
+ Microsoft Office are best parsed as random access files. In such cases the
+ content of the input stream is automatically spooled to a temporary file
+ that gets removed once parsed. A future version of Tika may make it possible
+ to avoid this extra file if the input document is already a file in the
+ local file system. See
+ {{{https://issues.apache.org/jira/browse/TIKA-153}TIKA-153}} for the status
+ of this feature request.
+
+* XHTML SAX events
+
+ The parsed content of the document stream is returned to the client
+ application as a sequence of XHTML SAX events. XHTML is used to express
+ structured content of the document and SAX events enable streamed
+ processing. Note that the XHTML format is used here only to convey
+ structural information, not to render the documents for browsing!
+
+ The XHTML SAX events produced by the parser implementation are sent to a
+ {{{http://docs.oracle.com/javase/6/docs/api/org/xml/sax/ContentHandler.html}ContentHandler}}
+ instance given to the <<<parse>>> method. If the content handler
+ fails to process an event, then parsing stops and the thrown
+ {{{http://docs.oracle.com/javase/6/docs/api/org/xml/sax/SAXException.html}SAXException}}
+ is passed up to the client application.
+
+ The overall structure of the generated event stream is (with indenting
+ added for clarity):
+
+---
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <head>
+ <title>...</title>
+ </head>
+ <body>
+ ...
+ </body>
+</html>
+---
+
+ Parser implementations typically use the
+ {{{./api/org/apache/tika/sax/XHTMLContentHandler.html}XHTMLContentHandler}}
+ utility class to generate the XHTML output.
+
+ Dealing with the raw SAX events can be a bit complex, so Apache Tika
+ comes with a number of utility classes that can be used to process and
+ convert the event stream to other representations.
+
+ For example, the
+ {{{./api/org/apache/tika/sax/BodyContentHandler.html}BodyContentHandler}}
+ class can be used to extract just the body part of the XHTML output and
+ feed it either as SAX events to another content handler or as characters
+ to an output stream, a writer, or simply a string. The following code
+ snippet parses a document from the standard input stream and outputs the
+ extracted text content to standard output:
+
+---
+ContentHandler handler = new BodyContentHandler(System.out);
+parser.parse(System.in, handler, ...);
+---
+
+ Another useful class is
+ {{{./api/org/apache/tika/parser/ParsingReader.html}ParsingReader}} that
+ uses a background thread to parse the document and returns the extracted
+ text content as a character stream:
+
+---
+InputStream stream = ...; // the document to be parsed
+Reader reader = new ParsingReader(parser, stream, ...);
+try {
+ ...; // read the document text using the reader
+} finally {
+ reader.close(); // the document stream is closed automatically
+}
+---
+
+* Document metadata
+
+ The third argument to the <<<parse>>> method is used to pass document
+ metadata both in and out of the parser. Document metadata is expressed
+ as an {{{./api/org/apache/tika/metadata/Metadata.html}Metadata}} object.
+
+ The following are some of the more interesting metadata properties:
+
+ [Metadata.RESOURCE_NAME_KEY] The name of the file or resource that contains
+ the document.
+
+ A client application can set this property to allow the parser to use
+ file name heuristics to determine the format of the document.
+
+ The parser implementation may set this property if the file format
+ contains the canonical name of the file (for example the Gzip format
+ has a slot for the file name).
+
+ [Metadata.CONTENT_TYPE] The declared content type of the document.
+
+ A client application can set this property based on for example a HTTP
+ Content-Type header. The declared content type may help the parser to
+ correctly interpret the document.
+
+ The parser implementation sets this property to the content type according
+ to which the document was parsed.
+
+ [Metadata.TITLE] The title of the document.
+
+ The parser implementation sets this property if the document format
+ contains an explicit title field.
+
+ [Metadata.AUTHOR] The name of the author of the document.
+
+ The parser implementation sets this property if the document format
+ contains an explicit author field.
+
+ []
+
+ Note that metadata handling is still being discussed by the Tika development
+ team, and it is likely that there will be some (backwards incompatible)
+ changes in metadata handling before Tika 1.0.
+
+* Parse context
+
+
+ The final argument to the <<<parse>>> method is used to inject
+ context-specific information to the parsing process. This is useful
+ for example when dealing with locale-specific date and number formats
+ in Microsoft Excel spreadsheets. Another important use of the parse
+ context is passing in the delegate parser instance to be used by
+ two-phase parsers like the
+ {{{./api/org/apache/tika/parser/pkg/PackageParser.html}PackageParser}} subclasses.
+ Some parser classes allow customization of the parsing process through
+ strategy objects in the parse context.
+
+* Parser implementations
+
+ Apache Tika comes with a number of parser classes for parsing
+ {{{./formats.html}various document formats}}. You can also extend Tika
+ with your own parsers, and of course any contributions to Tika are
+ warmly welcome.
+
+ The goal of Tika is to reuse existing parser libraries like
+ {{{http://pdfbox.apache.org/}PDFBox}} or
+ {{{http://poi.apache.org/}Apache POI}} as much as possible, and so most
+ of the parser classes in Tika are adapters to such external libraries.
+
+ Tika also contains some general purpose parser implementations that are
+ not targeted at any specific document formats. The most notable of these
+ is the {{{./api/org/apache/tika/parser/AutoDetectParser.html}AutoDetectParser}}
+ class that encapsulates all Tika functionality into a single parser that
+ can handle any types of documents. This parser will automatically determine
+ the type of the incoming document based on various heuristics and will then
+ parse the document accordingly.
+
+* {More Examples}
+
+ For more examples of parsing with Apache Tika, please take a look at
+ the {{{./examples.html}Tika Examples page}}.
Added: tika/site/src/site/apt/1.14/parser_guide.apt
URL: http://svn.apache.org/viewvc/tika/site/src/site/apt/1.14/parser_guide.apt?rev=1769232&view=auto
==============================================================================
--- tika/site/src/site/apt/1.14/parser_guide.apt (added)
+++ tika/site/src/site/apt/1.14/parser_guide.apt Fri Nov 11 00:09:36 2016
@@ -0,0 +1,141 @@
+ --------------------------------------------
+ Get Tika parsing up and running in 5 minutes
+ --------------------------------------------
+
+~~ Licensed to the Apache Software Foundation (ASF) under one or more
+~~ contributor license agreements. See the NOTICE file distributed with
+~~ this work for additional information regarding copyright ownership.
+~~ The ASF licenses this file to You under the Apache License, Version 2.0
+~~ (the "License"); you may not use this file except in compliance with
+~~ the License. You may obtain a copy of the License at
+~~
+~~ http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License.
+
+Get Tika parsing up and running in 5 minutes
+
+ This page is a quick start guide showing how to add a new parser to Apache Tika.
+ Following the simple steps listed below, your new parser can be running in only 5 minutes.
+
+%{toc|section=1|fromDepth=1}
+
+* {Getting Started}
+
+ The {{{./gettingstarted.html}Getting Started}} document describes how to
+ build Apache Tika from sources and how to start using Tika in an application. Pay close attention
+ and follow the instructions in the "Getting and building the sources" section.
+
+
+* {Add your MIME-Type}
+
+ Tika loads the core, standard MIME-Types from the file
+ "org/apache/tika/mime/tika-mimetypes.xml", which comes from
+ {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml;hb=refs/heads/master}tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml}} .
+ If your new MIME-Type is a standard one which is missing from Tika,
+ submit a patch for this file!
+
+ If your MIME-Type needs adding, create a new file
+ "org/apache/tika/mime/custom-mimetypes.xml" in your codebase.
+ You should add to it something like this:
+
+---
+ <?xml version="1.0" encoding="UTF-8"?>
+ <mime-info>
+ <mime-type type="application/hello">
+ <glob pattern="*.hi"/>
+ </mime-type>
+ </mime-info>
+---
+
+* {Create your Parser class}
+
+ Now, you need to create your new parser. This is a class that must
+ implement the Parser interface offered by Tika. Instead of implementing
+ the Parser interface directly, it is recommended that you extend the
+ abstract class AbstractParser if possible. AbstractParser handles
+ translating between API changes for you.
+
+ A very simple Tika Parser looks like this:
+
+---
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * @Author: Arturo Beltran
+ */
+package org.apache.tika.parser.hello;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class HelloParser extends AbstractParser {
+
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("hello"));
+ public static final String HELLO_MIME_TYPE = "application/hello";
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ metadata.set(Metadata.CONTENT_TYPE, HELLO_MIME_TYPE);
+ metadata.set("Hello", "World");
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+}
+---
+
+ Pay special attention to the definition of the SUPPORTED_TYPES static class
+ field in the parser class that defines what MIME-Types it supports. If
+ your MIME-Types aren't standard ones, ensure you have listed them in a
+ "custom-mimetypes.xml" file so that Tika knows about them (see above).
+
+ It is in the "parse" method that you will do all your work: that is, extract
+ the information from the resource and then set the metadata.
+
+* {List the new parser}
+
+ Finally, you should explicitly tell the AutoDetectParser to include your new
+ parser. This step is only needed if you want to use the AutoDetectParser functionality.
+ If you figure out the correct parser in a different way, it isn't needed.
+
+ List your new parser in:
+ {{{https://git-wip-us.apache.org/repos/asf?p=tika.git;a=blob;f=tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser;hb=refs/heads/master}tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser}}
+
+