You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2009/04/08 15:37:42 UTC
svn commit: r763242 - in
/jackrabbit/trunk/jackrabbit-text-extractors/src/main:
java/org/apache/jackrabbit/extractor/ resources/org/ resources/org/apache/
resources/org/apache/jackrabbit/ resources/org/apache/jackrabbit/extractor/
Author: jukka
Date: Wed Apr 8 13:37:41 2009
New Revision: 763242
URL: http://svn.apache.org/viewvc?rev=763242&view=rev
Log:
JCR-1878: Use Apache Tika for text extraction
Work around the POI loading issue in Java 1.4 by using a custom Tika configuration file.
Added:
jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/
jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/
jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/
jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/
jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml (with props)
Modified:
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java?rev=763242&r1=763241&r2=763242&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java Wed Apr 8 13:37:41 2009
@@ -21,6 +21,7 @@
import java.io.Reader;
import java.util.Set;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
@@ -42,8 +43,32 @@
private static final String[] TYPES;
static {
- AutoDetectParser parser = new AutoDetectParser();
+ // Workaround for TIKA-217: the default Tika configuration refers to
+ // Apache POI libraries that are compiled for Java 5 and therefore
+ // cannot be loaded on Java 1.4. When running on Java 1.4 we instead
+ // build the AutoDetectParser from the bundled tika-config-jdk14.xml,
+ // i.e. without the POI classes (and thus without MS Office support).
+ // NOTE(review): getResourceAsStream returns null when the resource is
+ // missing; stream.close() below would then throw an NPE - TODO confirm.
+ AutoDetectParser parser;
+ if ("1.4".equals(System.getProperty("java.specification.version"))) {
+ InputStream stream =
+ DefaultTextExtractor.class.getResourceAsStream("tika-config-jdk14.xml");
+ try {
+ try {
+ parser = new AutoDetectParser(new TikaConfig(stream));
+ } finally {
+ stream.close();
+ }
+ } catch (Exception e) {
+ throw new RuntimeException(
+ "Unable to load Tika configuration", e);
+ }
+ } else {
+ parser = new AutoDetectParser();
+ }
PARSER = parser;
+
Set types = parser.getParsers().keySet();
TYPES = (String[]) types.toArray(new String[types.size()]);
}
Added: jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml?rev=763242&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml (added)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml Wed Apr 8 13:37:41 2009
@@ -0,0 +1,134 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<properties>
+
+ <mimeTypeRepository resource="/org/apache/tika/mime/tika-mimetypes.xml" magic="false"/>
+
+ <parsers>
+
+ <parser name="parse-dcxml" class="org.apache.tika.parser.xml.DcXMLParser">
+ <mime>application/xml</mime>
+ <mime>image/svg+xml</mime>
+ </parser>
+
+ <parser name="parse-html" class="org.apache.tika.parser.html.HtmlParser">
+ <mime>text/html</mime>
+ <mime>application/xhtml+xml</mime>
+ <mime>application/x-asp</mime>
+ </parser>
+
+ <parser name="parse-rtf" class="org.apache.tika.parser.rtf.RTFParser">
+ <mime>application/rtf</mime>
+ </parser>
+
+ <parser name="parse-pdf" class="org.apache.tika.parser.pdf.PDFParser">
+ <mime>application/pdf</mime>
+ </parser>
+
+ <parser name="parse-txt" class="org.apache.tika.parser.txt.TXTParser">
+ <mime>text/plain</mime>
+ </parser>
+
+ <parser name="parse-openoffice" class="org.apache.tika.parser.opendocument.OpenOfficeParser">
+ <mime>application/vnd.sun.xml.writer</mime>
+ <mime>application/vnd.oasis.opendocument.text</mime>
+ <mime>application/vnd.oasis.opendocument.graphics</mime>
+ <mime>application/vnd.oasis.opendocument.presentation</mime>
+ <mime>application/vnd.oasis.opendocument.spreadsheet</mime>
+ <mime>application/vnd.oasis.opendocument.chart</mime>
+ <mime>application/vnd.oasis.opendocument.image</mime>
+ <mime>application/vnd.oasis.opendocument.formula</mime>
+ <mime>application/vnd.oasis.opendocument.text-master</mime>
+ <mime>application/vnd.oasis.opendocument.text-web</mime>
+ <mime>application/vnd.oasis.opendocument.text-template</mime>
+ <mime>application/vnd.oasis.opendocument.graphics-template</mime>
+ <mime>application/vnd.oasis.opendocument.presentation-template</mime>
+ <mime>application/vnd.oasis.opendocument.spreadsheet-template</mime>
+ <mime>application/vnd.oasis.opendocument.chart-template</mime>
+ <mime>application/vnd.oasis.opendocument.image-template</mime>
+ <mime>application/vnd.oasis.opendocument.formula-template</mime>
+ <mime>application/x-vnd.oasis.opendocument.text</mime>
+ <mime>application/x-vnd.oasis.opendocument.graphics</mime>
+ <mime>application/x-vnd.oasis.opendocument.presentation</mime>
+ <mime>application/x-vnd.oasis.opendocument.spreadsheet</mime>
+ <mime>application/x-vnd.oasis.opendocument.chart</mime>
+ <mime>application/x-vnd.oasis.opendocument.image</mime>
+ <mime>application/x-vnd.oasis.opendocument.formula</mime>
+ <mime>application/x-vnd.oasis.opendocument.text-master</mime>
+ <mime>application/x-vnd.oasis.opendocument.text-web</mime>
+ <mime>application/x-vnd.oasis.opendocument.text-template</mime>
+ <mime>application/x-vnd.oasis.opendocument.graphics-template</mime>
+ <mime>application/x-vnd.oasis.opendocument.presentation-template</mime>
+ <mime>application/x-vnd.oasis.opendocument.spreadsheet-template</mime>
+ <mime>application/x-vnd.oasis.opendocument.chart-template</mime>
+ <mime>application/x-vnd.oasis.opendocument.image-template</mime>
+ <mime>application/x-vnd.oasis.opendocument.formula-template</mime>
+ </parser>
+
+ <parser name="parse-image" class="org.apache.tika.parser.image.ImageParser">
+ <mime>image/bmp</mime>
+ <mime>image/gif</mime>
+ <mime>image/jpeg</mime>
+ <mime>image/png</mime>
+ <mime>image/tiff</mime>
+ <mime>image/vnd.wap.wbmp</mime>
+ <mime>image/x-icon</mime>
+ <mime>image/x-psd</mime>
+ <mime>image/x-xcf</mime>
+ </parser>
+
+ <parser name="parse-zip" class="org.apache.tika.parser.pkg.ZipParser">
+ <mime>application/zip</mime>
+ </parser>
+
+ <parser name="parse-tar" class="org.apache.tika.parser.pkg.TarParser">
+ <mime>application/x-tar</mime>
+ </parser>
+
+ <parser name="parse-gzip" class="org.apache.tika.parser.pkg.GzipParser">
+ <mime>application/x-gzip</mime>
+ </parser>
+
+ <parser name="parse-bzip2" class="org.apache.tika.parser.pkg.Bzip2Parser">
+ <mime>application/x-bzip</mime>
+ </parser>
+
+ <parser name="parse-class" class="org.apache.tika.parser.asm.ClassParser">
+ <mime>application/x-tika-java-class</mime>
+ </parser>
+
+ <parser name="parse-mp3" class="org.apache.tika.parser.mp3.Mp3Parser">
+ <mime>audio/mpeg</mime>
+ </parser>
+
+ <parser name="parse-midi" class="org.apache.tika.parser.audio.MidiParser">
+ <mime>application/x-midi</mime>
+ <mime>audio/midi</mime>
+ </parser>
+
+ <parser name="parse-audio" class="org.apache.tika.parser.audio.AudioParser">
+ <mime>audio/basic</mime>
+ <mime>audio/x-wav</mime>
+ <mime>audio/x-aiff</mime>
+ </parser>
+
+ </parsers>
+
+</properties>
\ No newline at end of file
Propchange: jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml
------------------------------------------------------------------------------
svn:eol-style = native