You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2009/04/08 06:11:11 UTC

svn commit: r762814 - in /jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor: DefaultTextExtractor.java MsTextExtractor.java TikaTextExtractor.java

Author: jukka
Date: Tue Apr  7 15:21:51 2009
New Revision: 762814

URL: http://svn.apache.org/viewvc?rev=762814&view=rev
Log:
JCR-1878: Use Apache Tika for text extraction

Make TikaTextExtractor into the DefaultTextExtractor.

Removed:
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java
Modified:
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java

Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java?rev=762814&r1=762813&r2=762814&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java Tue Apr  7 15:21:51 2009
@@ -16,11 +16,50 @@
  */
 package org.apache.jackrabbit.extractor;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParsingReader;
+
 /**
- * Composite text extractor that by default contains the standard
- * text extractors found in this package.
- *
- * @deprecated Use {@link TikaTextExtractor} instead
+ * Default text extractor based on Apache Tika.
  */
-public class DefaultTextExtractor extends TikaTextExtractor {
+public class DefaultTextExtractor implements TextExtractor {
+
+    /**
+     * Auto-detecting parser.
+     */
+    private static final Parser PARSER;
+
+    /**
+     * Supported content types.
+     */
+    private static final String[] TYPES;
+
+    static {
+        AutoDetectParser parser = new AutoDetectParser();
+        PARSER = parser;
+        Set types = parser.getParsers().keySet();
+        TYPES = (String[]) types.toArray(new String[types.size()]);
+    }
+
+    public String[] getContentTypes() {
+        return TYPES;
+    }
+
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        Metadata metadata = new Metadata();
+        if (type != null && type.trim().length() > 0) {
+            metadata.set(Metadata.CONTENT_TYPE, type.trim());
+        }
+        // TODO: This creates a background thread. Is that a problem?
+        return new ParsingReader(PARSER, stream, metadata);
+    }
+
 }

Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java?rev=762814&r1=762813&r2=762814&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java Tue Apr  7 15:21:51 2009
@@ -20,7 +20,7 @@
 /**
  * Text extractor for Microsoft Word documents.
  */
-public class MsTextExtractor extends TikaTextExtractor {
+public class MsTextExtractor extends DefaultTextExtractor {
 
     private static String[] TYPES = new String[] {
         "application/vnd.ms-word",