You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2009/04/08 15:37:42 UTC

svn commit: r763242 - in /jackrabbit/trunk/jackrabbit-text-extractors/src/main: java/org/apache/jackrabbit/extractor/ resources/org/ resources/org/apache/ resources/org/apache/jackrabbit/ resources/org/apache/jackrabbit/extractor/

Author: jukka
Date: Wed Apr  8 13:37:41 2009
New Revision: 763242

URL: http://svn.apache.org/viewvc?rev=763242&view=rev
Log:
JCR-1878: Use Apache Tika for text extraction

Work around the POI loading issue in Java 1.4 by using a custom Tika configuration file.

Added:
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml   (with props)
Modified:
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java

Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java?rev=763242&r1=763241&r2=763242&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java Wed Apr  8 13:37:41 2009
@@ -21,6 +21,7 @@
 import java.io.Reader;
 import java.util.Set;
 
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
@@ -42,8 +43,32 @@
     private static final String[] TYPES;
 
     static {
-        AutoDetectParser parser = new AutoDetectParser();
+        // The default Tika configuration refers to Apache POI libraries that
+        // are compiled for Java 5, and can thus not be loaded in Java 1.4.
+        // This makes it impossible to load the default Tika configuration
+        // (see TIKA-217 for background), and so we need to use the following
+        // workaround to instantiate the Tika AutoDetectParser without the
+        // POI classes (and thus support for MS Office formats) when running
+        // on Java 1.4.
+        AutoDetectParser parser;
+        if ("1.4".equals(System.getProperty("java.specification.version"))) {
+            InputStream stream =
+                DefaultTextExtractor.class.getResourceAsStream("tika-config-jdk14.xml");
+            try {
+                try {
+                    parser = new AutoDetectParser(new TikaConfig(stream));
+                } finally {
+                    stream.close();
+                }
+            } catch (Exception e) {
+                throw new RuntimeException(
+                        "Unable to load Tika configuration", e);
+            }
+        } else {
+            parser = new AutoDetectParser();
+        }
         PARSER = parser;
+
         Set types = parser.getParsers().keySet();
         TYPES = (String[]) types.toArray(new String[types.size()]);
     }

Added: jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml?rev=763242&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml (added)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml Wed Apr  8 13:37:41 2009
@@ -0,0 +1,134 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<properties>
+
+  <mimeTypeRepository resource="/org/apache/tika/mime/tika-mimetypes.xml" magic="false"/>
+
+  <parsers>
+
+    <parser name="parse-dcxml" class="org.apache.tika.parser.xml.DcXMLParser">
+      <mime>application/xml</mime>
+      <mime>image/svg+xml</mime>
+    </parser>
+
+    <parser name="parse-html" class="org.apache.tika.parser.html.HtmlParser">
+      <mime>text/html</mime>
+      <mime>application/xhtml+xml</mime>
+      <mime>application/x-asp</mime>
+    </parser>
+
+    <parser name="parse-rtf" class="org.apache.tika.parser.rtf.RTFParser">
+      <mime>application/rtf</mime>
+    </parser>
+
+    <parser name="parse-pdf" class="org.apache.tika.parser.pdf.PDFParser">
+      <mime>application/pdf</mime>
+    </parser>
+
+    <parser name="parse-txt" class="org.apache.tika.parser.txt.TXTParser">
+      <mime>text/plain</mime>
+    </parser>
+
+    <parser name="parse-openoffice" class="org.apache.tika.parser.opendocument.OpenOfficeParser">
+      <mime>application/vnd.sun.xml.writer</mime>
+      <mime>application/vnd.oasis.opendocument.text</mime>
+      <mime>application/vnd.oasis.opendocument.graphics</mime>
+      <mime>application/vnd.oasis.opendocument.presentation</mime>
+      <mime>application/vnd.oasis.opendocument.spreadsheet</mime>
+      <mime>application/vnd.oasis.opendocument.chart</mime>
+      <mime>application/vnd.oasis.opendocument.image</mime>
+      <mime>application/vnd.oasis.opendocument.formula</mime>
+      <mime>application/vnd.oasis.opendocument.text-master</mime>
+      <mime>application/vnd.oasis.opendocument.text-web</mime>
+      <mime>application/vnd.oasis.opendocument.text-template</mime>
+      <mime>application/vnd.oasis.opendocument.graphics-template</mime>
+      <mime>application/vnd.oasis.opendocument.presentation-template</mime>
+      <mime>application/vnd.oasis.opendocument.spreadsheet-template</mime>
+      <mime>application/vnd.oasis.opendocument.chart-template</mime>
+      <mime>application/vnd.oasis.opendocument.image-template</mime>
+      <mime>application/vnd.oasis.opendocument.formula-template</mime>
+      <mime>application/x-vnd.oasis.opendocument.text</mime>
+      <mime>application/x-vnd.oasis.opendocument.graphics</mime>
+      <mime>application/x-vnd.oasis.opendocument.presentation</mime>
+      <mime>application/x-vnd.oasis.opendocument.spreadsheet</mime>
+      <mime>application/x-vnd.oasis.opendocument.chart</mime>
+      <mime>application/x-vnd.oasis.opendocument.image</mime>
+      <mime>application/x-vnd.oasis.opendocument.formula</mime>
+      <mime>application/x-vnd.oasis.opendocument.text-master</mime>
+      <mime>application/x-vnd.oasis.opendocument.text-web</mime>
+      <mime>application/x-vnd.oasis.opendocument.text-template</mime>
+      <mime>application/x-vnd.oasis.opendocument.graphics-template</mime>
+      <mime>application/x-vnd.oasis.opendocument.presentation-template</mime>
+      <mime>application/x-vnd.oasis.opendocument.spreadsheet-template</mime>
+      <mime>application/x-vnd.oasis.opendocument.chart-template</mime>
+      <mime>application/x-vnd.oasis.opendocument.image-template</mime>
+      <mime>application/x-vnd.oasis.opendocument.formula-template</mime>
+    </parser>
+
+    <parser name="parse-image" class="org.apache.tika.parser.image.ImageParser">
+      <mime>image/bmp</mime>
+      <mime>image/gif</mime>
+      <mime>image/jpeg</mime>
+      <mime>image/png</mime>
+      <mime>image/tiff</mime>
+      <mime>image/vnd.wap.wbmp</mime>
+      <mime>image/x-icon</mime>
+      <mime>image/x-psd</mime>
+      <mime>image/x-xcf</mime>
+    </parser>
+
+    <parser name="parse-zip" class="org.apache.tika.parser.pkg.ZipParser">
+      <mime>application/zip</mime>
+    </parser>
+
+    <parser name="parse-tar" class="org.apache.tika.parser.pkg.TarParser">
+      <mime>application/x-tar</mime>
+    </parser>
+
+    <parser name="parse-gzip" class="org.apache.tika.parser.pkg.GzipParser">
+      <mime>application/x-gzip</mime>
+    </parser>
+
+    <parser name="parse-bzip2" class="org.apache.tika.parser.pkg.Bzip2Parser">
+      <mime>application/x-bzip</mime>
+    </parser>
+
+    <parser name="parse-class" class="org.apache.tika.parser.asm.ClassParser">
+      <mime>application/x-tika-java-class</mime>
+    </parser>
+
+    <parser name="parse-mp3" class="org.apache.tika.parser.mp3.Mp3Parser">
+      <mime>audio/mpeg</mime>
+    </parser>
+
+    <parser name="parse-midi" class="org.apache.tika.parser.audio.MidiParser">
+      <mime>application/x-midi</mime>
+      <mime>audio/midi</mime>
+    </parser>
+
+    <parser name="parse-audio" class="org.apache.tika.parser.audio.AudioParser">
+      <mime>audio/basic</mime>
+      <mime>audio/x-wav</mime>
+      <mime>audio/x-aiff</mime>
+    </parser>
+
+  </parsers>
+
+</properties>
\ No newline at end of file

Propchange: jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml
------------------------------------------------------------------------------
    svn:eol-style = native