You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2009/04/08 06:11:10 UTC

svn commit: r762808 - /jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java

Author: jukka
Date: Tue Apr  7 15:05:15 2009
New Revision: 762808

URL: http://svn.apache.org/viewvc?rev=762808&view=rev
Log:
JCR-1887: msoffice text extractor for office 2007 files

Replace the implementation with an Apache Tika-based one from TIKA-1878. This way we won't get compile errors due to the Java 5 POI libraries.

Modified:
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java

Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java?rev=762808&r1=762807&r2=762808&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java Tue Apr  7 15:05:15 2009
@@ -16,67 +16,25 @@
  */
 package org.apache.jackrabbit.extractor;
 
-import org.apache.poi.extractor.ExtractorFactory;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.Reader;
-import java.io.InputStream;
-import java.io.IOException;
-import java.io.StringReader;
 
 /**
  * Text extractor for Microsoft Word documents.
  */
-public class MsTextExtractor extends AbstractTextExtractor {
-
-    /**
-     * Logger instance.
-     */
-    private static final Logger logger =
-        LoggerFactory.getLogger(MsTextExtractor.class);
-
-    /**
-     * Force loading of dependent class.
-     */
-    static {
-        ExtractorFactory.class.getName();
-    }
-
-    /**
-     * Creates a new <code>MsWordTextExtractor</code> instance.
-     */
-    public MsTextExtractor() {
-        super(new String[]{"application/vnd.ms-word", 
-                           "application/msword",
-                           "application/vnd.ms-powerpoint",
-                           "application/mspowerpoint",
-                           "application/vnd.ms-excel",
-                           "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-                           "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-                           "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"});
-    }
+public class MsTextExtractor extends TikaTextExtractor {
 
-    //-------------------------------------------------------< TextExtractor >
+    private static String[] TYPES = new String[] {
+        "application/vnd.ms-word", 
+        "application/msword",
+        "application/vnd.ms-powerpoint",
+        "application/mspowerpoint",
+        "application/vnd.ms-excel",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+    };
 
-    /**
-     * {@inheritDoc}
-     * Returns an empty reader if an error occured extracting text from
-     * the word document.
-     */
-    public Reader extractText(InputStream stream,
-                              String type,
-                              String encoding) throws IOException {
-        try {
-            String text = ExtractorFactory.createExtractor(stream).getText();
-            return new StringReader(text);
-        } catch (Exception e) {
-            logger.warn("Failed to extract Microsoft Document text content", e);
-            return new StringReader("");
-        } finally {
-            stream.close();
-        }
+    public String[] getContentTypes() {
+        return TYPES;
     }
 
 }